Index: conf/hive-default.xml.template
===================================================================
--- conf/hive-default.xml.template (revision 1386502)
+++ conf/hive-default.xml.template (working copy)
@@ -795,6 +795,24 @@
+<property>
+  <name>hive.auto.sort.merge.join</name>
+  <value>false</value>
+  <description>Whether the join will be automatically converted to a sort-merge join, if the joined tables pass
+  the criteria for a sort-merge join.
+  </description>
+</property>
+
+<property>
+  <name>hive.auto.sort.merge.join.bigtbl.matcher</name>
+  <value>org.apache.hadoop.hive.ql.optimizer.LeftSortMergeJoinBigTableMatcher</value>
+  <description>The policy used to choose the big table for automatic conversion to a sort-merge join.
+  By default, the leftmost table in the join is chosen as the big table. An alternative, size-based policy is
+  org.apache.hadoop.hive.ql.optimizer.SizeSortMergeJoinBigTableMatcher.
+  New policies can be added in the future.
+  </description>
+</property>
+
   <name>hive.metastore.ds.connection.url.hook</name>
   <description>Name of the hook to use for retriving the JDO connection URL. If empty, the value in javax.jdo.option.ConnectionURL is used</description>
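For reference, a minimal usage sketch of the two settings above (not part of this patch; it assumes the size-based matcher class mentioned in the description is available), in the same style as the new .q tests:

  set hive.auto.sort.merge.join=true;
  set hive.auto.sort.merge.join.bigtbl.matcher=org.apache.hadoop.hive.ql.optimizer.SizeSortMergeJoinBigTableMatcher;

  -- With both tables bucketed and sorted on the join key, the optimizer may now
  -- plan a Sorted Merge Bucket Map Join instead of a common join:
  explain extended select count(*) from bucket_small a join bucket_big b on a.key = b.key;
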
Index: data/files/smallsrcsortbucket4outof4.txt
===================================================================
--- data/files/smallsrcsortbucket4outof4.txt (revision 0)
+++ data/files/smallsrcsortbucket4outof4.txt (working copy)
@@ -0,0 +1,5 @@
+146val_146
+193val_193
+432val_432
+65val_65
+83val_83
Index: data/files/smallsrcsortbucket1outof4.txt
===================================================================
--- data/files/smallsrcsortbucket1outof4.txt (revision 0)
+++ data/files/smallsrcsortbucket1outof4.txt (working copy)
@@ -0,0 +1,5 @@
+0val_0
+103val_103
+169val_169
+172val_172
+374val_374
Index: data/files/smallsrcsortbucket2outof4.txt
===================================================================
--- data/files/smallsrcsortbucket2outof4.txt (revision 0)
+++ data/files/smallsrcsortbucket2outof4.txt (working copy)
@@ -0,0 +1,5 @@
+180val_180
+221val_221
+379val_379
+478val_478
+74val_74
Index: data/files/smallsrcsortbucket3outof4.txt
===================================================================
--- data/files/smallsrcsortbucket3outof4.txt (revision 0)
+++ data/files/smallsrcsortbucket3outof4.txt (working copy)
@@ -0,0 +1,5 @@
+233val_233
+424val_424
+468val_468
+53val_53
+97val_97
Index: common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
===================================================================
--- common/src/java/org/apache/hadoop/hive/conf/HiveConf.java (revision 1386502)
+++ common/src/java/org/apache/hadoop/hive/conf/HiveConf.java (working copy)
@@ -475,6 +475,10 @@
HIVEENFORCESORTMERGEBUCKETMAPJOIN("hive.enforce.sortmergebucketmapjoin", false),
HIVEENFORCEBUCKETMAPJOIN("hive.enforce.bucketmapjoin", false),
+ HIVE_AUTO_SORT_MERGE_JOIN("hive.auto.sort.merge.join", false),
+ HIVE_AUTO_SORT_MERGE_JOIN_BIGTABLE_MATCHER("hive.auto.sort.merge.join.bigtbl.matcher",
+ "org.apache.hadoop.hive.ql.optimizer.LeftSortMergeJoinBigTableMatcher"),
+
HIVESCRIPTOPERATORTRUST("hive.exec.script.trust", false),
HIVEROWOFFSET("hive.exec.rowoffset", false),
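A hypothetical sketch of reading the two new ConfVars added above through the standard HiveConf accessors (getBoolVar/getVar are existing HiveConf API; the wrapper class and how the optimizer actually consumes the values are assumptions, not part of this hunk):

  import org.apache.hadoop.hive.conf.HiveConf;

  public class AutoSortMergeJoinConfExample {
    public static void main(String[] args) {
      HiveConf conf = new HiveConf();
      // false unless hive.auto.sort.merge.join=true is set in the session or site config
      boolean autoConvert = conf.getBoolVar(HiveConf.ConfVars.HIVE_AUTO_SORT_MERGE_JOIN);
      // fully qualified class name of the big-table selection policy (leftmost-table matcher by default)
      String bigTableMatcher = conf.getVar(HiveConf.ConfVars.HIVE_AUTO_SORT_MERGE_JOIN_BIGTABLE_MATCHER);
      System.out.println(autoConvert + " / " + bigTableMatcher);
    }
  }
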
Index: ql/src/test/results/clientpositive/auto_sortmerge_join_4.q.out
===================================================================
--- ql/src/test/results/clientpositive/auto_sortmerge_join_4.q.out (revision 0)
+++ ql/src/test/results/clientpositive/auto_sortmerge_join_4.q.out (working copy)
@@ -0,0 +1,263 @@
+PREHOOK: query: -- small 2 part, 4 bucket & big 1 part, 2 bucket
+CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: -- small 2 part, 4 bucket & big 1 part, 2 bucket
+CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@bucket_small
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small
+POSTHOOK: Output: default@bucket_small@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small@ds=2008-04-08
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small@ds=2008-04-08
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small@ds=2008-04-08
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small
+POSTHOOK: Output: default@bucket_small@ds=2008-04-09
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small@ds=2008-04-09
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small@ds=2008-04-09
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small@ds=2008-04-09
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small@ds=2008-04-09
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small@ds=2008-04-09
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small@ds=2008-04-09
+PREHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@bucket_big
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big
+POSTHOOK: Output: default@bucket_big@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big@ds=2008-04-08
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big@ds=2008-04-08
+PREHOOK: query: -- Since size is being used to find the big table, the order of the tables in the join does not matter
+explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- Since size is being used to find the big table, the order of the tables in the join does not matter
+explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME bucket_small) a) (TOK_TABREF (TOK_TABNAME bucket_big) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count)))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ b
+ TableScan
+ alias: b
+ GatherStats: false
+ Sorted Merge Bucket Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-08
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 2
+ numPartitions 1
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 2750
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 2
+ numPartitions 1
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 2750
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: -mr-10002
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0
+ columns.types bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@bucket_big@ds=2008-04-08
+PREHOOK: Input: default@bucket_small@ds=2008-04-08
+PREHOOK: Input: default@bucket_small@ds=2008-04-09
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@bucket_big@ds=2008-04-08
+POSTHOOK: Input: default@bucket_small@ds=2008-04-08
+POSTHOOK: Input: default@bucket_small@ds=2008-04-09
+#### A masked pattern was here ####
+38
Index: ql/src/test/results/clientpositive/auto_sortmerge_join_6.q.out
===================================================================
--- ql/src/test/results/clientpositive/auto_sortmerge_join_6.q.out (revision 0)
+++ ql/src/test/results/clientpositive/auto_sortmerge_join_6.q.out (working copy)
@@ -0,0 +1,300 @@
+PREHOOK: query: -- small no part, 4 bucket & big 2 part, 2 bucket
+CREATE TABLE bucket_small (key string, value string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: -- small no part, 4 bucket & big 2 part, 2 bucket
+CREATE TABLE bucket_small (key string, value string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@bucket_small
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small
+PREHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@bucket_big
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big
+POSTHOOK: Output: default@bucket_big@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big@ds=2008-04-08
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big
+POSTHOOK: Output: default@bucket_big@ds=2008-04-09
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big@ds=2008-04-09
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big@ds=2008-04-09
+PREHOOK: query: -- Since size is being used to find the big table, the order of the tables in the join does not matter
+explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- Since size is being used to find the big table, the order of the tables in the join does not matter
+explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME bucket_small) a) (TOK_TABREF (TOK_TABNAME bucket_big) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count)))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ b
+ TableScan
+ alias: b
+ GatherStats: false
+ Sorted Merge Bucket Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-08
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 2
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 2750
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5500
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-09
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-09
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 2
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 2750
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5500
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: -mr-10002
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0
+ columns.types bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@bucket_big@ds=2008-04-08
+PREHOOK: Input: default@bucket_big@ds=2008-04-09
+PREHOOK: Input: default@bucket_small
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@bucket_big@ds=2008-04-08
+POSTHOOK: Input: default@bucket_big@ds=2008-04-09
+POSTHOOK: Input: default@bucket_small
+#### A masked pattern was here ####
+38
Index: ql/src/test/results/clientpositive/auto_sortmerge_join_1.q.out
===================================================================
--- ql/src/test/results/clientpositive/auto_sortmerge_join_1.q.out (revision 0)
+++ ql/src/test/results/clientpositive/auto_sortmerge_join_1.q.out (working copy)
@@ -0,0 +1,315 @@
+PREHOOK: query: -- small 1 part, 2 bucket & big 2 part, 4 bucket
+
+CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: -- small 1 part, 2 bucket & big 2 part, 4 bucket
+
+CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@bucket_small
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small
+POSTHOOK: Output: default@bucket_small@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small@ds=2008-04-08
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small@ds=2008-04-08
+PREHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@bucket_big
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big
+POSTHOOK: Output: default@bucket_big@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big@ds=2008-04-08
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big@ds=2008-04-08
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big@ds=2008-04-08
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big
+POSTHOOK: Output: default@bucket_big@ds=2008-04-09
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big@ds=2008-04-09
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big@ds=2008-04-09
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big@ds=2008-04-09
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big@ds=2008-04-09
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big@ds=2008-04-09
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big@ds=2008-04-09
+PREHOOK: query: -- Since size is being used to find the big table, the order of the tables in the join does not matter
+explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- Since size is being used to find the big table, the order of the tables in the join does not matter
+explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME bucket_small) a) (TOK_TABREF (TOK_TABNAME bucket_big) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count)))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ b
+ TableScan
+ alias: b
+ GatherStats: false
+ Sorted Merge Bucket Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-08
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5812
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 8
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 11624
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-09
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-09
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5812
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 8
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 11624
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: -mr-10002
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0
+ columns.types bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@bucket_big@ds=2008-04-08
+PREHOOK: Input: default@bucket_big@ds=2008-04-09
+PREHOOK: Input: default@bucket_small@ds=2008-04-08
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@bucket_big@ds=2008-04-08
+POSTHOOK: Input: default@bucket_big@ds=2008-04-09
+POSTHOOK: Input: default@bucket_small@ds=2008-04-08
+#### A masked pattern was here ####
+38
Index: ql/src/test/results/clientpositive/auto_sortmerge_join_8.q.out
===================================================================
--- ql/src/test/results/clientpositive/auto_sortmerge_join_8.q.out (revision 0)
+++ ql/src/test/results/clientpositive/auto_sortmerge_join_8.q.out (working copy)
@@ -0,0 +1,328 @@
+PREHOOK: query: -- small 2 part, 2 bucket & big 2 part, 4 bucket
+CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: -- small 2 part, 2 bucket & big 2 part, 4 bucket
+CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@bucket_small
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small
+POSTHOOK: Output: default@bucket_small@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small@ds=2008-04-08
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small
+POSTHOOK: Output: default@bucket_small@ds=2008-04-09
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small@ds=2008-04-09
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small@ds=2008-04-09
+PREHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@bucket_big
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big
+POSTHOOK: Output: default@bucket_big@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big@ds=2008-04-08
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big@ds=2008-04-08
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big@ds=2008-04-08
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big
+POSTHOOK: Output: default@bucket_big@ds=2008-04-09
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big@ds=2008-04-09
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big@ds=2008-04-09
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big@ds=2008-04-09
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big@ds=2008-04-09
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big@ds=2008-04-09
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big@ds=2008-04-09
+PREHOOK: query: -- Since size is being used to find the big table, the order of the tables in the join does not matter
+explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- Since size is being used to find the big table, the order of the tables in the join does not matter
+explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME bucket_small) a) (TOK_TABREF (TOK_TABNAME bucket_big) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count)))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ b
+ TableScan
+ alias: b
+ GatherStats: false
+ Sorted Merge Bucket Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-08
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5812
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 8
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 11624
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-09
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-09
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5812
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 8
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 11624
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: -mr-10002
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0
+ columns.types bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@bucket_big@ds=2008-04-08
+PREHOOK: Input: default@bucket_big@ds=2008-04-09
+PREHOOK: Input: default@bucket_small@ds=2008-04-08
+PREHOOK: Input: default@bucket_small@ds=2008-04-09
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@bucket_big@ds=2008-04-08
+POSTHOOK: Input: default@bucket_big@ds=2008-04-09
+POSTHOOK: Input: default@bucket_small@ds=2008-04-08
+POSTHOOK: Input: default@bucket_small@ds=2008-04-09
+#### A masked pattern was here ####
+76
Index: ql/src/test/results/clientpositive/auto_sortmerge_join_3.q.out
===================================================================
--- ql/src/test/results/clientpositive/auto_sortmerge_join_3.q.out (revision 0)
+++ ql/src/test/results/clientpositive/auto_sortmerge_join_3.q.out (working copy)
@@ -0,0 +1,251 @@
+PREHOOK: query: -- small 2 part, 2 bucket & big 1 part, 4 bucket
+CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: -- small 2 part, 2 bucket & big 1 part, 4 bucket
+CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@bucket_small
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small
+POSTHOOK: Output: default@bucket_small@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small@ds=2008-04-08
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small
+POSTHOOK: Output: default@bucket_small@ds=2008-04-09
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small@ds=2008-04-09
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small@ds=2008-04-09
+PREHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@bucket_big
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big
+POSTHOOK: Output: default@bucket_big@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big@ds=2008-04-08
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big@ds=2008-04-08
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big@ds=2008-04-08
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big@ds=2008-04-08
+PREHOOK: query: -- Since size is being used to find the big table, the order of the tables in the join does not matter
+explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- Since size is being used to find the big table, the order of the tables in the join does not matter
+explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME bucket_small) a) (TOK_TABREF (TOK_TABNAME bucket_big) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count)))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ b
+ TableScan
+ alias: b
+ GatherStats: false
+ Sorted Merge Bucket Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-08
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numPartitions 1
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5812
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numPartitions 1
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5812
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: -mr-10002
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0
+ columns.types bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@bucket_big@ds=2008-04-08
+PREHOOK: Input: default@bucket_small@ds=2008-04-08
+PREHOOK: Input: default@bucket_small@ds=2008-04-09
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@bucket_big@ds=2008-04-08
+POSTHOOK: Input: default@bucket_small@ds=2008-04-08
+POSTHOOK: Input: default@bucket_small@ds=2008-04-09
+#### A masked pattern was here ####
+38
Index: ql/src/test/results/clientpositive/auto_sortmerge_join_5.q.out
===================================================================
--- ql/src/test/results/clientpositive/auto_sortmerge_join_5.q.out (revision 0)
+++ ql/src/test/results/clientpositive/auto_sortmerge_join_5.q.out (working copy)
@@ -0,0 +1,230 @@
+PREHOOK: query: -- small no part, 4 bucket & big no part, 2 bucket
+CREATE TABLE bucket_small (key string, value string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: -- small no part, 4 bucket & big no part, 2 bucket
+CREATE TABLE bucket_small (key string, value string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@bucket_small
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small
+PREHOOK: query: CREATE TABLE bucket_big (key string, value string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE bucket_big (key string, value string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@bucket_big
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big
+PREHOOK: query: -- Since size is being used to find the big table, the order of the tables in the join does not matter
+explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- Since size is being used to find the big table, the order of the tables in the join does not matter
+explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME bucket_small) a) (TOK_TABREF (TOK_TABNAME bucket_big) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count)))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ b
+ TableScan
+ alias: b
+ GatherStats: false
+ Sorted Merge Bucket Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: bucket_big
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 2
+ numPartitions 0
+ numRows 0
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 2750
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 2
+ numPartitions 0
+ numRows 0
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 2750
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: -mr-10002
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0
+ columns.types bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@bucket_big
+PREHOOK: Input: default@bucket_small
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@bucket_big
+POSTHOOK: Input: default@bucket_small
+#### A masked pattern was here ####
+19
Index: ql/src/test/results/clientpositive/auto_sortmerge_join_7.q.out
===================================================================
--- ql/src/test/results/clientpositive/auto_sortmerge_join_7.q.out (revision 0)
+++ ql/src/test/results/clientpositive/auto_sortmerge_join_7.q.out (working copy)
@@ -0,0 +1,328 @@
+PREHOOK: query: -- small 2 part, 4 bucket & big 2 part, 2 bucket
+CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: -- small 2 part, 4 bucket & big 2 part, 2 bucket
+CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@bucket_small
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small
+POSTHOOK: Output: default@bucket_small@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small@ds=2008-04-08
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small@ds=2008-04-08
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small@ds=2008-04-08
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small
+POSTHOOK: Output: default@bucket_small@ds=2008-04-09
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small@ds=2008-04-09
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small@ds=2008-04-09
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small@ds=2008-04-09
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small@ds=2008-04-09
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small@ds=2008-04-09
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small@ds=2008-04-09
+PREHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@bucket_big
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big
+POSTHOOK: Output: default@bucket_big@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big@ds=2008-04-08
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big
+POSTHOOK: Output: default@bucket_big@ds=2008-04-09
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big@ds=2008-04-09
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big@ds=2008-04-09
+PREHOOK: query: -- Since size is being used to find the big table, the order of the tables in the join does not matter
+explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- Since size is being used to find the big table, the order of the tables in the join does not matter
+explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME bucket_small) a) (TOK_TABREF (TOK_TABNAME bucket_big) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count)))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ b
+ TableScan
+ alias: b
+ GatherStats: false
+ Sorted Merge Bucket Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-08
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 2
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 2750
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5500
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-09
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-09
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 2
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 2750
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5500
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: -mr-10002
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0
+ columns.types bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@bucket_big@ds=2008-04-08
+PREHOOK: Input: default@bucket_big@ds=2008-04-09
+PREHOOK: Input: default@bucket_small@ds=2008-04-08
+PREHOOK: Input: default@bucket_small@ds=2008-04-09
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@bucket_big@ds=2008-04-08
+POSTHOOK: Input: default@bucket_big@ds=2008-04-09
+POSTHOOK: Input: default@bucket_small@ds=2008-04-08
+POSTHOOK: Input: default@bucket_small@ds=2008-04-09
+#### A masked pattern was here ####
+76
Index: ql/src/test/results/clientpositive/auto_sortmerge_join_2.q.out
===================================================================
--- ql/src/test/results/clientpositive/auto_sortmerge_join_2.q.out (revision 0)
+++ ql/src/test/results/clientpositive/auto_sortmerge_join_2.q.out (working copy)
@@ -0,0 +1,539 @@
+PREHOOK: query: -- small 1 part, 4 bucket & big 2 part, 2 bucket
+CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: -- small 1 part, 4 bucket & big 2 part, 2 bucket
+CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@bucket_small
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small
+POSTHOOK: Output: default@bucket_small@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small@ds=2008-04-08
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small@ds=2008-04-08
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small@ds=2008-04-08
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small@ds=2008-04-08
+PREHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@bucket_big
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big
+POSTHOOK: Output: default@bucket_big@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big@ds=2008-04-08
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big
+POSTHOOK: Output: default@bucket_big@ds=2008-04-09
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big@ds=2008-04-09
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big@ds=2008-04-09
+PREHOOK: query: -- Since the leftmost table is assumed to be the big table, arrange the tables in the join accordingly
+explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- Since the leftmost table is assumed to be the big table, arrange the tables in the join accordingly
+explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME bucket_big) a) (TOK_TABREF (TOK_TABNAME bucket_small) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count)))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a
+ TableScan
+ alias: a
+ GatherStats: false
+ Sorted Merge Bucket Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-08
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 2
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 2750
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5500
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-09
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-09
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 2
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 2750
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5500
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: -mr-10002
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0
+ columns.types bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@bucket_big@ds=2008-04-08
+PREHOOK: Input: default@bucket_big@ds=2008-04-09
+PREHOOK: Input: default@bucket_small@ds=2008-04-08
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@bucket_big@ds=2008-04-08
+POSTHOOK: Input: default@bucket_big@ds=2008-04-09
+POSTHOOK: Input: default@bucket_small@ds=2008-04-08
+#### A masked pattern was here ####
+38
+PREHOOK: query: -- Since size is being used to find the big table, the order of the tables in the join does not matter
+explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- Since size is being used to find the big table, the order of the tables in the join does not matter
+explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME bucket_small) a) (TOK_TABREF (TOK_TABNAME bucket_big) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count)))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ b
+ TableScan
+ alias: b
+ GatherStats: false
+ Sorted Merge Bucket Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-08
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 2
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 2750
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5500
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-09
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-09
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 2
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 2750
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5500
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: -mr-10002
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0
+ columns.types bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@bucket_big@ds=2008-04-08
+PREHOOK: Input: default@bucket_big@ds=2008-04-09
+PREHOOK: Input: default@bucket_small@ds=2008-04-08
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@bucket_big@ds=2008-04-08
+POSTHOOK: Input: default@bucket_big@ds=2008-04-09
+POSTHOOK: Input: default@bucket_small@ds=2008-04-08
+#### A masked pattern was here ####
+38
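The two plans in auto_sortmerge_join_2.q.out above show how the big-table matcher policy is reflected in the plan: with the default org.apache.hadoop.hive.ql.optimizer.LeftSortMergeJoinBigTableMatcher the leftmost table is treated as the big table, so bucket_big is listed first and the plan reports "Position of Big Table: 0"; with org.apache.hadoop.hive.ql.optimizer.SizeSortMergeJoinBigTableMatcher the big table is chosen by size, so bucket_big is picked even when it appears second and the plan reports "Position of Big Table: 1". A minimal session sketch of the two policies, using only the settings and test tables from the .q files in this patch (illustrative, not part of the patch itself):

    set hive.auto.sort.merge.join=true;
    set hive.optimize.bucketmapjoin=true;
    set hive.optimize.bucketmapjoin.sortedmerge=true;

    -- default policy: the leftmost table is the big table, so list bucket_big first
    set hive.auto.sort.merge.join.bigtbl.matcher=org.apache.hadoop.hive.ql.optimizer.LeftSortMergeJoinBigTableMatcher;
    explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key;

    -- size-based policy: the join order does not matter
    set hive.auto.sort.merge.join.bigtbl.matcher=org.apache.hadoop.hive.ql.optimizer.SizeSortMergeJoinBigTableMatcher;
    explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key;
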
Index: ql/src/test/queries/clientpositive/auto_sortmerge_join_8.q
===================================================================
--- ql/src/test/queries/clientpositive/auto_sortmerge_join_8.q (revision 0)
+++ ql/src/test/queries/clientpositive/auto_sortmerge_join_8.q (working copy)
@@ -0,0 +1,28 @@
+-- small 2 part, 2 bucket & big 2 part, 4 bucket
+CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE;
+load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08');
+load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08');
+
+load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09');
+load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09');
+
+CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE;
+load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08');
+load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08');
+load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08');
+load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08');
+
+load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09');
+load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09');
+load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09');
+load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09');
+
+set hive.auto.sort.merge.join=true;
+set hive.optimize.bucketmapjoin = true;
+set hive.optimize.bucketmapjoin.sortedmerge = true;
+
+set hive.auto.sort.merge.join.bigtbl.matcher = org.apache.hadoop.hive.ql.optimizer.SizeSortMergeJoinBigTableMatcher;
+
+-- Since size is being used to find the big table, the order of the tables in the join does not matter
+explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key;
+select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key;
Index: ql/src/test/queries/clientpositive/auto_sortmerge_join_1.q
===================================================================
--- ql/src/test/queries/clientpositive/auto_sortmerge_join_1.q (revision 0)
+++ ql/src/test/queries/clientpositive/auto_sortmerge_join_1.q (working copy)
@@ -0,0 +1,26 @@
+-- small 1 part, 2 bucket & big 2 part, 4 bucket
+
+CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE;
+load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08');
+load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08');
+
+CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE;
+load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08');
+load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08');
+load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08');
+load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08');
+
+load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09');
+load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09');
+load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09');
+load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09');
+
+set hive.auto.sort.merge.join=true;
+set hive.optimize.bucketmapjoin = true;
+set hive.optimize.bucketmapjoin.sortedmerge = true;
+
+set hive.auto.sort.merge.join.bigtbl.matcher = org.apache.hadoop.hive.ql.optimizer.SizeSortMergeJoinBigTableMatcher;
+
+-- Since size is being used to find the big table, the order of the tables in the join does not matter
+explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key;
+select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key;
Index: ql/src/test/queries/clientpositive/auto_sortmerge_join_3.q
===================================================================
--- ql/src/test/queries/clientpositive/auto_sortmerge_join_3.q (revision 0)
+++ ql/src/test/queries/clientpositive/auto_sortmerge_join_3.q (working copy)
@@ -0,0 +1,23 @@
+-- small 2 part, 2 bucket & big 1 part, 4 bucket
+CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE;
+load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08');
+load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08');
+
+load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09');
+load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09');
+
+CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE;
+load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08');
+load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08');
+load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08');
+load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08');
+
+set hive.auto.sort.merge.join=true;
+set hive.optimize.bucketmapjoin = true;
+set hive.optimize.bucketmapjoin.sortedmerge = true;
+
+set hive.auto.sort.merge.join.bigtbl.matcher = org.apache.hadoop.hive.ql.optimizer.SizeSortMergeJoinBigTableMatcher;
+
+-- Since size is being used to find the big table, the order of the tables in the join does not matter
+explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key;
+select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key;
Index: ql/src/test/queries/clientpositive/auto_sortmerge_join_5.q
===================================================================
--- ql/src/test/queries/clientpositive/auto_sortmerge_join_5.q (revision 0)
+++ ql/src/test/queries/clientpositive/auto_sortmerge_join_5.q (working copy)
@@ -0,0 +1,21 @@
+-- small no part, 4 bucket & big no part, 2 bucket
+CREATE TABLE bucket_small (key string, value string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE;
+load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small;
+load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small;
+load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small;
+load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small;
+
+CREATE TABLE bucket_big (key string, value string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE;
+load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big;
+load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big;
+
+set hive.auto.sort.merge.join=true;
+set hive.optimize.bucketmapjoin = true;
+set hive.optimize.bucketmapjoin.sortedmerge = true;
+
+set hive.auto.sort.merge.join.bigtbl.matcher = org.apache.hadoop.hive.ql.optimizer.SizeSortMergeJoinBigTableMatcher;
+
+-- Since size is being used to find the big table, the order of the tables in the join does not matter
+explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key;
+select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key;
+
Index: ql/src/test/queries/clientpositive/auto_sortmerge_join_7.q
===================================================================
--- ql/src/test/queries/clientpositive/auto_sortmerge_join_7.q (revision 0)
+++ ql/src/test/queries/clientpositive/auto_sortmerge_join_7.q (working copy)
@@ -0,0 +1,28 @@
+-- small 2 part, 4 bucket & big 2 part, 2 bucket
+CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE;
+load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08');
+load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08');
+load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08');
+load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08');
+
+load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09');
+load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09');
+load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09');
+load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09');
+
+CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE;
+load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08');
+load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08');
+
+load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09');
+load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09');
+
+set hive.auto.sort.merge.join=true;
+set hive.optimize.bucketmapjoin = true;
+set hive.optimize.bucketmapjoin.sortedmerge = true;
+
+set hive.auto.sort.merge.join.bigtbl.matcher = org.apache.hadoop.hive.ql.optimizer.SizeSortMergeJoinBigTableMatcher;
+
+-- Since size is being used to find the big table, the order of the tables in the join does not matter
+explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key;
+select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key;
Index: ql/src/test/queries/clientpositive/auto_sortmerge_join_2.q
===================================================================
--- ql/src/test/queries/clientpositive/auto_sortmerge_join_2.q (revision 0)
+++ ql/src/test/queries/clientpositive/auto_sortmerge_join_2.q (working copy)
@@ -0,0 +1,29 @@
+-- small 1 part, 4 bucket & big 2 part, 2 bucket
+CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE;
+load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08');
+load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08');
+load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08');
+load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08');
+
+CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE;
+load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08');
+load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08');
+
+load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09');
+load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09');
+
+set hive.auto.sort.merge.join=true;
+set hive.optimize.bucketmapjoin = true;
+set hive.optimize.bucketmapjoin.sortedmerge = true;
+
+set hive.auto.sort.merge.join.bigtbl.matcher = org.apache.hadoop.hive.ql.optimizer.LeftSortMergeJoinBigTableMatcher;
+
+-- Since the leftmost table is assumed to be the big table, arrange the tables in the join accordingly
+explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key;
+select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key;
+
+set hive.auto.sort.merge.join.bigtbl.matcher = org.apache.hadoop.hive.ql.optimizer.SizeSortMergeJoinBigTableMatcher;
+
+-- Since size is being used to find the big table, the order of the tables in the join does not matter
+explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key;
+select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key;
Index: ql/src/test/queries/clientpositive/auto_sortmerge_join_4.q
===================================================================
--- ql/src/test/queries/clientpositive/auto_sortmerge_join_4.q (revision 0)
+++ ql/src/test/queries/clientpositive/auto_sortmerge_join_4.q (working copy)
@@ -0,0 +1,25 @@
+-- small 2 part, 4 bucket & big 1 part, 2 bucket
+CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE;
+load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08');
+load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08');
+load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08');
+load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08');
+
+load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09');
+load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09');
+load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09');
+load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09');
+
+CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE;
+load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08');
+load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08');
+
+set hive.auto.sort.merge.join=true;
+set hive.optimize.bucketmapjoin = true;
+set hive.optimize.bucketmapjoin.sortedmerge = true;
+
+set hive.auto.sort.merge.join.bigtbl.matcher = org.apache.hadoop.hive.ql.optimizer.SizeSortMergeJoinBigTableMatcher;
+
+-- Since size is being used to find the big table, the order of the tables in the join does not matter
+explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key;
+select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key;
Index: ql/src/test/queries/clientpositive/auto_sortmerge_join_6.q
===================================================================
--- ql/src/test/queries/clientpositive/auto_sortmerge_join_6.q (revision 0)
+++ ql/src/test/queries/clientpositive/auto_sortmerge_join_6.q (working copy)
@@ -0,0 +1,23 @@
+-- small no part, 4 bucket & big 2 part, 2 bucket
+CREATE TABLE bucket_small (key string, value string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE;
+load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small;
+load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small;
+load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small;
+load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small;
+
+CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE;
+load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08');
+load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08');
+
+load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09');
+load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09');
+
+set hive.auto.sort.merge.join=true;
+set hive.optimize.bucketmapjoin = true;
+set hive.optimize.bucketmapjoin.sortedmerge = true;
+
+set hive.auto.sort.merge.join.bigtbl.matcher = org.apache.hadoop.hive.ql.optimizer.SizeSortMergeJoinBigTableMatcher;
+
+-- Since size is being used to find the big table, the order of the tables in the join does not matter
+explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key;
+select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key;
Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/AbstractSMBJoinProc.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/AbstractSMBJoinProc.java (revision 0)
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/AbstractSMBJoinProc.java (working copy)
@@ -0,0 +1,423 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.optimizer;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Stack;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.metastore.api.Order;
+import org.apache.hadoop.hive.ql.exec.FunctionRegistry;
+import org.apache.hadoop.hive.ql.exec.JoinOperator;
+import org.apache.hadoop.hive.ql.exec.MapJoinOperator;
+import org.apache.hadoop.hive.ql.exec.Operator;
+import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
+import org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator;
+import org.apache.hadoop.hive.ql.exec.TableScanOperator;
+import org.apache.hadoop.hive.ql.lib.Node;
+import org.apache.hadoop.hive.ql.lib.NodeProcessor;
+import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.metadata.Partition;
+import org.apache.hadoop.hive.ql.metadata.Table;
+import org.apache.hadoop.hive.ql.optimizer.ppr.PartitionPruner;
+import org.apache.hadoop.hive.ql.parse.ParseContext;
+import org.apache.hadoop.hive.ql.parse.PrunedPartitionList;
+import org.apache.hadoop.hive.ql.parse.QBJoinTree;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
+import org.apache.hadoop.hive.ql.plan.OperatorDesc;
+import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc;
+import org.apache.hadoop.hive.ql.plan.SMBJoinDesc;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
+import org.apache.hadoop.util.ReflectionUtils;
+
+//try to replace a bucket map join with a sorted merge map join
+abstract public class AbstractSMBJoinProc extends AbstractBucketJoinProc implements NodeProcessor {
+
+ private static final Log LOG = LogFactory
+ .getLog(SortedMergeBucketMapJoinOptimizer.class.getName());
+
+ public AbstractSMBJoinProc(ParseContext pctx) {
+ super(pctx);
+ }
+
+ public AbstractSMBJoinProc() {
+ super();
+ }
+
+ @Override
+ abstract public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
+ Object... nodeOutputs) throws SemanticException;
+
+ // Return true or false based on whether the mapjoin can be converted to
+ // a sort-merge map join operator.
+ protected boolean canConvertBucketMapJoinToSMBJoin(MapJoinOperator mapJoinOp,
+ Stack<Node> stack,
+ SortBucketJoinOptProcCtx smbJoinContext,
+ Object... nodeOutputs) throws SemanticException {
+
+ if (mapJoinOp.getConf().getAliasBucketFileNameMapping() == null
+ || mapJoinOp.getConf().getAliasBucketFileNameMapping().size() == 0) {
+ return false;
+ }
+
+ boolean tableSorted = true;
+ QBJoinTree joinCxt = this.pGraphContext.getMapJoinContext()
+ .get(mapJoinOp);
+ if (joinCxt == null) {
+ return false;
+ }
+ String[] srcs = joinCxt.getBaseSrc();
+ int pos = 0;
+
+ // All the tables/partitions columns should be sorted in the same order
+ // For example, if tables A and B are being joined on columns c1, c2 and c3,
+ // which are the sorted and bucketed columns, the join would work as long as
+ // c1, c2 and c3 are sorted in the same order.
+ List<Order> sortColumnsFirstTable = new ArrayList<Order>();
+
+ for (String src : srcs) {
+ tableSorted = tableSorted
+ && isTableSorted(smbJoinContext,
+ pGraphContext,
+ mapJoinOp.getConf().getKeys().get((byte) pos),
+ joinCxt,
+ src,
+ pos,
+ sortColumnsFirstTable);
+ pos++;
+ }
+ if (!tableSorted) {
+ // this is a mapjoin but not suited for a sort merge bucket map join. check outer joins
+ MapJoinProcessor.checkMapJoin(mapJoinOp.getConf().getPosBigTable(),
+ mapJoinOp.getConf().getConds());
+ return false;
+ }
+
+ smbJoinContext.setSrcs(srcs);
+ return true;
+ }
+
+
+ // Convert the bucket map-join operator to a sort-merge map join operator
+ protected SMBMapJoinOperator convertBucketMapJoinToSMBJoin(MapJoinOperator mapJoinOp,
+ SortBucketJoinOptProcCtx smbJoinContext) {
+
+ String[] srcs = smbJoinContext.getSrcs();
+ SMBMapJoinOperator smbJop = new SMBMapJoinOperator(mapJoinOp);
+ SMBJoinDesc smbJoinDesc = new SMBJoinDesc(mapJoinOp.getConf());
+ smbJop.setConf(smbJoinDesc);
+ HashMap<Byte, String> tagToAlias = new HashMap<Byte, String>();
+ for (int i = 0; i < srcs.length; i++) {
+ tagToAlias.put((byte) i, srcs[i]);
+ }
+ smbJoinDesc.setTagToAlias(tagToAlias);
+
+ int indexInListMapJoinNoReducer =
+ this.pGraphContext.getListMapJoinOpsNoReducer().indexOf(mapJoinOp);
+ if (indexInListMapJoinNoReducer >= 0 ) {
+ this.pGraphContext.getListMapJoinOpsNoReducer().remove(indexInListMapJoinNoReducer);
+ this.pGraphContext.getListMapJoinOpsNoReducer().add(indexInListMapJoinNoReducer, smbJop);
+ }
+
+ List<Operator<? extends OperatorDesc>> parentOperators = mapJoinOp.getParentOperators();
+ for (int i = 0; i < parentOperators.size(); i++) {
+ Operator<? extends OperatorDesc> par = parentOperators.get(i);
+ int index = par.getChildOperators().indexOf(mapJoinOp);
+ par.getChildOperators().remove(index);
+ par.getChildOperators().add(index, smbJop);
+ }
+ List<Operator<? extends OperatorDesc>> childOps = mapJoinOp.getChildOperators();
+ for (int i = 0; i < childOps.size(); i++) {
+ Operator<? extends OperatorDesc> child = childOps.get(i);
+ int index = child.getParentOperators().indexOf(mapJoinOp);
+ child.getParentOperators().remove(index);
+ child.getParentOperators().add(index, smbJop);
+ }
+ return smbJop;
+ }
+
+ /**
+ * Whether this table is eligible for a sort-merge join.
+ *
+ * @param pctx parse context
+ * @param keys join key expressions for the table being checked
+ * @param joinTree join tree being considered
+ * @param alias table alias in the join tree being checked
+ * @param pos position of the table
+ * @param sortColumnsFirstTable The names and order of the sorted columns for the first table.
+ * It is not initialized when pos = 0.
+ * @return
+ * @throws SemanticException
+ */
+ private boolean isTableSorted(
+ SortBucketJoinOptProcCtx smbJoinContext,
+ ParseContext pctx,
+ List<ExprNodeDesc> keys,
+ QBJoinTree joinTree,
+ String alias,
+ int pos,
+ List<Order> sortColumnsFirstTable) throws SemanticException {
+
+ HashMap<String, Operator<? extends OperatorDesc>> topOps = this.pGraphContext.getTopOps();
+ Map<TableScanOperator, Table> topToTable = this.pGraphContext
+ .getTopToTable();
+ TableScanOperator tso = (TableScanOperator) topOps.get(alias);
+ if (tso == null) {
+ return false;
+ }
+
+ // get all join columns from join keys
+ List<String> joinCols = new ArrayList<String>();
+ List<ExprNodeDesc> joinKeys = new ArrayList<ExprNodeDesc>();
+ joinKeys.addAll(keys);
+ while (joinKeys.size() > 0) {
+ ExprNodeDesc node = joinKeys.remove(0);
+ if (node instanceof ExprNodeColumnDesc) {
+ joinCols.addAll(node.getCols());
+ } else if (node instanceof ExprNodeGenericFuncDesc) {
+ ExprNodeGenericFuncDesc udfNode = ((ExprNodeGenericFuncDesc) node);
+ GenericUDF udf = udfNode.getGenericUDF();
+ if (!FunctionRegistry.isDeterministic(udf)) {
+ return false;
+ }
+ joinKeys.addAll(0, udfNode.getChildExprs());
+ }
+ }
+
+ Table tbl = topToTable.get(tso);
+ if (tbl.isPartitioned()) {
+ PrunedPartitionList prunedParts = null;
+ try {
+ prunedParts = pGraphContext.getOpToPartList().get(tso);
+ if (prunedParts == null) {
+ prunedParts = PartitionPruner.prune(tbl, pGraphContext
+ .getOpToPartPruner().get(tso), pGraphContext.getConf(), alias,
+ pGraphContext.getPrunedPartitions());
+ pGraphContext.getOpToPartList().put(tso, prunedParts);
+ }
+ } catch (HiveException e) {
+ LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+ throw new SemanticException(e.getMessage(), e);
+ }
+ List<Partition> partitions = prunedParts.getNotDeniedPartns();
+ // Populate the names and order of columns for the first partition of the
+ // first table
+ if ((pos == 0) && (partitions != null) && (!partitions.isEmpty())) {
+ Partition firstPartition = partitions.get(0);
+ sortColumnsFirstTable.addAll(firstPartition.getSortCols());
+ }
+
+ for (Partition partition : prunedParts.getNotDeniedPartns()) {
+ if (!checkSortColsAndJoinCols(partition.getSortCols(),
+ joinCols,
+ sortColumnsFirstTable)) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ // Populate the names and order of columns for the first table
+ if (pos == 0) {
+ sortColumnsFirstTable.addAll(tbl.getSortCols());
+ }
+
+ return checkSortColsAndJoinCols(tbl.getSortCols(),
+ joinCols,
+ sortColumnsFirstTable);
+ }
+
+ private boolean checkSortColsAndJoinCols(List<Order> sortCols,
+ List<String> joinCols,
+ List<Order> sortColumnsFirstPartition) {
+
+ if (sortCols == null || sortCols.size() != joinCols.size()) {
+ return false;
+ }
+
+ List<String> sortColNames = new ArrayList<String>();
+
+ // The join columns should contain all the sort columns
+ // The sort columns of all the tables should be in the same order
+ // compare the column names and the order with the first table/partition.
+ for (int pos = 0; pos < sortCols.size(); pos++) {
+ Order o = sortCols.get(pos);
+ if (!o.equals(sortColumnsFirstPartition.get(pos))) {
+ return false;
+ }
+ sortColNames.add(sortColumnsFirstPartition.get(pos).getCol());
+ }
+
+ // The column names and order (ascending/descending) matched
+ // The join columns should contain sort columns
+ return sortColNames.containsAll(joinCols);
+ }
+
+ // Can the join operator be converted to a sort-merge join operator ?
+ // It is already verified that the join can be converted to a bucket map join
+ protected boolean checkConvertJoinToSMBJoin(
+ JoinOperator joinOperator,
+ SortBucketJoinOptProcCtx smbJoinContext,
+ ParseContext pGraphContext) throws SemanticException {
+
+ boolean tableSorted = true;
+ QBJoinTree joinCtx = pGraphContext.getJoinContext().get(joinOperator);
+
+ if (joinCtx == null) {
+ return false;
+ }
+ String[] srcs = joinCtx.getBaseSrc();
+ int pos = 0;
+
+ // All the tables/partitions columns should be sorted in the same order
+ // For example, if tables A and B are being joined on columns c1, c2 and c3,
+ // which are the sorted and bucketed columns, the join would work as long as
+ // c1, c2 and c3 are sorted in the same order.
+ List<Order> sortColumnsFirstTable = new ArrayList<Order>();
+
+ for (String src : srcs) {
+ tableSorted = tableSorted &&
+ isTableSorted(smbJoinContext,
+ pGraphContext,
+ smbJoinContext.getKeyExprMap().get((byte)pos),
+ joinCtx,
+ src,
+ pos,
+ sortColumnsFirstTable);
+ pos++;
+ }
+
+ smbJoinContext.setSrcs(srcs);
+ return tableSorted;
+ }
+
+ // Can the join operator be converted to a sort-merge join operator ?
+ protected boolean canConvertJoinToSMBJoin(
+ JoinOperator joinOperator,
+ SortBucketJoinOptProcCtx smbJoinContext,
+ ParseContext pGraphContext) throws SemanticException {
+ boolean canConvert =
+ canConvertJoinToBucketMapJoin(
+ joinOperator,
+ pGraphContext,
+ smbJoinContext
+ );
+
+ if (!canConvert) {
+ return false;
+ }
+
+ return checkConvertJoinToSMBJoin(joinOperator, smbJoinContext, pGraphContext);
+ }
+
+ // Can the join operator be converted to a bucket map join operator?
+ protected boolean canConvertJoinToBucketMapJoin(
+ JoinOperator joinOp,
+ ParseContext pGraphContext,
+ SortBucketJoinOptProcCtx context) throws SemanticException {
+
+ // This has already been inspected and rejected
+ if (context.getListOfRejectedJoins().contains(joinOp)) {
+ return false;
+ }
+
+ QBJoinTree joinCtx = pGraphContext.getJoinContext().get(joinOp);
+ if (joinCtx == null) {
+ return false;
+ }
+
+ Class<? extends SortMergeJoinBigTableMatcher> bigTableMatcherClass = null;
+ try {
+ bigTableMatcherClass =
+ (Class<? extends SortMergeJoinBigTableMatcher>)
+ (Class.forName(HiveConf.getVar(pGraphContext.getConf(),
+ HiveConf.ConfVars.HIVE_AUTO_SORT_MERGE_JOIN_BIGTABLE_MATCHER)));
+ } catch (ClassNotFoundException e) {
+ throw new RuntimeException(e.getMessage());
+ }
+
+ SortMergeJoinBigTableMatcher bigTableMatcher =
+ (SortMergeJoinBigTableMatcher) ReflectionUtils.newInstance(bigTableMatcherClass, null);
+ int bigTablePosition =
+ bigTableMatcher.getBigTablePosition(pGraphContext, joinOp);
+ context.setBigTablePosition(bigTablePosition);
+ String joinAlias =
+ bigTablePosition == 0 ?
+ joinCtx.getLeftAlias() : joinCtx.getRightAliases()[bigTablePosition - 1];
+
+ Map<Byte, List<ExprNodeDesc>> keyExprMap = new HashMap<Byte, List<ExprNodeDesc>>();
+ List<Operator<? extends OperatorDesc>> parentOps = joinOp.getParentOperators();
+ // get the join keys from parent ReduceSink operators
+ for (Operator<? extends OperatorDesc> parentOp : parentOps) {
+ ReduceSinkDesc rsconf = ((ReduceSinkOperator)parentOp).getConf();
+ Byte tag = (byte) rsconf.getTag();
+ List<ExprNodeDesc> keys = rsconf.getKeyCols();
+ keyExprMap.put(tag, keys);
+ }
+
+ context.setKeyExprMap(keyExprMap);
+
+ // Given the candidate big table, can this join be converted to a bucket map join?
+ // The candidate big table was chosen by the pluggable sort merge join big
+ // table matcher.
+ return checkConvertBucketMapJoin(
+ pGraphContext,
+ context,
+ joinCtx,
+ keyExprMap,
+ joinAlias,
+ Arrays.asList(joinCtx.getBaseSrc()));
+ }
+
+ // Convert the join operator to a bucket map join operator
+ protected MapJoinOperator convertJoinToBucketMapJoin(
+ JoinOperator joinOp,
+ SortBucketJoinOptProcCtx joinContext,
+ ParseContext parseContext) throws SemanticException {
+ MapJoinOperator mapJoinOp = MapJoinProcessor.convertMapJoin(
+ parseContext.getOpParseCtx(),
+ joinOp,
+ pGraphContext.getJoinContext().get(joinOp),
+ joinContext.getBigTablePosition(),
+ false);
+ // Remove the join operator from the query join context
+ parseContext.getJoinContext().remove(joinOp);
+ convertMapJoinToBucketMapJoin(mapJoinOp, joinContext);
+ return mapJoinOp;
+ }
+
+ // Convert the join operator to a sort-merge join operator
+ protected void convertJoinToSMBJoin(
+ JoinOperator joinOp,
+ SortBucketJoinOptProcCtx smbJoinContext,
+ ParseContext parseContext) throws SemanticException {
+ MapJoinOperator mapJoinOp = convertJoinToBucketMapJoin(joinOp, smbJoinContext, parseContext);
+ convertBucketMapJoinToSMBJoin(mapJoinOp, smbJoinContext);
+ }
+}
Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortBucketJoinOptProcCtx.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortBucketJoinOptProcCtx.java (revision 0)
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortBucketJoinOptProcCtx.java (working copy)
@@ -0,0 +1,59 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.optimizer;
+
+import java.util.List;
+import java.util.Map;
+
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
+
+
+public class SortBucketJoinOptProcCtx extends BucketJoinOptProcCtx {
+ private String[] srcs;
+ private int bigTablePosition;
+ private Map<Byte, List<ExprNodeDesc>> keyExprMap;
+
+ public SortBucketJoinOptProcCtx(HiveConf conf) {
+ super(conf);
+ }
+
+ public String[] getSrcs() {
+ return srcs;
+ }
+
+ public void setSrcs(String[] srcs) {
+ this.srcs = srcs;
+ }
+
+ public int getBigTablePosition() {
+ return bigTablePosition;
+ }
+
+ public void setBigTablePosition(int bigTablePosition) {
+ this.bigTablePosition = bigTablePosition;
+ }
+
+ public Map<Byte, List<ExprNodeDesc>> getKeyExprMap() {
+ return keyExprMap;
+ }
+
+ public void setKeyExprMap(Map<Byte, List<ExprNodeDesc>> keyExprMap) {
+ this.keyExprMap = keyExprMap;
+ }
+}
Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortedMergeBucketMapJoinOptimizer.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortedMergeBucketMapJoinOptimizer.java (revision 1386502)
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortedMergeBucketMapJoinOptimizer.java (working copy)
@@ -19,22 +19,19 @@
package org.apache.hadoop.hive.ql.optimizer;
import java.util.ArrayList;
-import java.util.HashMap;
import java.util.LinkedHashMap;
-import java.util.List;
import java.util.Map;
import java.util.Stack;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hive.conf.HiveConf;
-import org.apache.hadoop.hive.metastore.api.Order;
import org.apache.hadoop.hive.ql.ErrorMsg;
-import org.apache.hadoop.hive.ql.exec.FunctionRegistry;
+import org.apache.hadoop.hive.ql.exec.JoinOperator;
import org.apache.hadoop.hive.ql.exec.MapJoinOperator;
import org.apache.hadoop.hive.ql.exec.Operator;
+import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
import org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator;
-import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
import org.apache.hadoop.hive.ql.lib.Dispatcher;
@@ -44,20 +41,9 @@
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
import org.apache.hadoop.hive.ql.lib.Rule;
import org.apache.hadoop.hive.ql.lib.RuleRegExp;
-import org.apache.hadoop.hive.ql.metadata.HiveException;
-import org.apache.hadoop.hive.ql.metadata.Partition;
-import org.apache.hadoop.hive.ql.metadata.Table;
-import org.apache.hadoop.hive.ql.optimizer.ppr.PartitionPruner;
import org.apache.hadoop.hive.ql.parse.ParseContext;
-import org.apache.hadoop.hive.ql.parse.PrunedPartitionList;
-import org.apache.hadoop.hive.ql.parse.QBJoinTree;
import org.apache.hadoop.hive.ql.parse.SemanticException;
-import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
-import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
-import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
-import org.apache.hadoop.hive.ql.plan.SMBJoinDesc;
-import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
//try to replace a bucket map join with a sorted merge map join
public class SortedMergeBucketMapJoinOptimizer implements Transform {
@@ -68,9 +54,38 @@
public SortedMergeBucketMapJoinOptimizer() {
}
+ private void getListOfRejectedJoins(
+ ParseContext pctx, SortBucketJoinOptProcCtx smbJoinContext)
+ throws SemanticException {
+
+ // go through all joins - the path from the table scan to the join should only contain selects, filters and script operators
+ Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
+ opRules.put(new RuleRegExp("R1", JoinOperator.getOperatorName() + "%"),
+ getCheckCandidateJoin());
+
+ // The dispatcher fires the processor corresponding to the closest matching
+ // rule and passes the context along
+ Dispatcher disp = new DefaultRuleDispatcher(getDefaultProc(), opRules, smbJoinContext);
+ GraphWalker ogw = new DefaultGraphWalker(disp);
+
+ // Create a list of topop nodes
+ ArrayList<Node> topNodes = new ArrayList<Node>();
+ topNodes.addAll(pctx.getTopOps().values());
+ ogw.startWalking(topNodes, null);
+ }
+
@Override
public ParseContext transform(ParseContext pctx) throws SemanticException {
+ HiveConf conf = pctx.getConf();
+ SortBucketJoinOptProcCtx smbJoinContext =
+ new SortBucketJoinOptProcCtx(conf);
+ // Get a list of joins which cannot be converted to a sort merge join
+ // Only select and filter operators are currently allowed between the table scan and
+ // the join. More operators can be added - the method supportAutomaticSortMergeJoin
+ // dictates which operators are allowed
+ getListOfRejectedJoins(pctx, smbJoinContext);
+
Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
// go through all map joins and find out all which have enabled bucket map
// join.
@@ -78,7 +93,15 @@
getSortedMergeBucketMapjoinProc(pctx));
// The dispatcher fires the processor corresponding to the closest matching
// rule and passes the context along
- Dispatcher disp = new DefaultRuleDispatcher(getDefaultProc(), opRules, null);
+
+ // There is no need for the user to specify mapjoin for it to be
+ // converted to sort-merge join
+ if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_AUTO_SORT_MERGE_JOIN)) {
+ opRules.put(new RuleRegExp("R2", "JOIN%"),
+ getSortedMergeJoinProc(pctx));
+ }
+
+ Dispatcher disp = new DefaultRuleDispatcher(getDefaultProc(), opRules, smbJoinContext);
GraphWalker ogw = new DefaultGraphWalker(disp);
// Create a list of topop nodes
@@ -89,10 +112,41 @@
return pctx;
}
+ class SortedMergeJoinProc extends AbstractSMBJoinProc implements NodeProcessor {
+
+ public SortedMergeJoinProc(ParseContext pctx) {
+ super(pctx);
+ }
+
+ public SortedMergeJoinProc() {
+ }
+
+ @Override
+ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
+ Object... nodeOutputs) throws SemanticException {
+
+ JoinOperator joinOp = (JoinOperator) nd;
+ SortBucketJoinOptProcCtx smbJoinContext = (SortBucketJoinOptProcCtx)procCtx;
+
+ boolean convert =
+ canConvertJoinToSMBJoin(
+ joinOp, smbJoinContext, pGraphContext);
+
+ if (convert) {
+ convertJoinToSMBJoin(joinOp, smbJoinContext, pGraphContext);
+ }
+ return null;
+ }
+ }
+
private NodeProcessor getSortedMergeBucketMapjoinProc(ParseContext pctx) {
return new SortedMergeBucketMapjoinProc(pctx);
}
+ private NodeProcessor getSortedMergeJoinProc(ParseContext pctx) {
+ return new SortedMergeJoinProc(pctx);
+ }
+
private NodeProcessor getDefaultProc() {
return new NodeProcessor() {
@Override
@@ -104,70 +158,57 @@
};
}
- class SortedMergeBucketMapjoinProc implements NodeProcessor {
- private ParseContext pGraphContext;
+ // check if the join operator encountered is a candidate for being converted
+ // to a sort-merge join
+ private NodeProcessor getCheckCandidateJoin() {
+ return new NodeProcessor() {
+ @Override
+ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
+ Object... nodeOutputs) throws SemanticException {
+ SortBucketJoinOptProcCtx smbJoinContext = (SortBucketJoinOptProcCtx)procCtx;
+ JoinOperator joinOperator = (JoinOperator)nd;
+ int size = stack.size();
+ if (!(stack.get(size-1) instanceof JoinOperator) ||
+ !(stack.get(size-2) instanceof ReduceSinkOperator)) {
+ smbJoinContext.getListOfRejectedJoins().add(joinOperator);
+ return null;
+ }
+ // If any operator in the stack does not support auto-conversion, this join should
+ // not be converted.
+ for (int pos = size - 3; pos >= 0; pos--) {
+ Operator<? extends OperatorDesc> op = (Operator<? extends OperatorDesc>) stack.get(pos);
+ if (!op.supportAutomaticSortMergeJoin()) {
+ smbJoinContext.getListOfRejectedJoins().add(joinOperator);
+ return null;
+ }
+ }
+
+ return null;
+ }
+ };
+ }
+
+ class SortedMergeBucketMapjoinProc extends AbstractSMBJoinProc implements NodeProcessor {
public SortedMergeBucketMapjoinProc(ParseContext pctx) {
- this.pGraphContext = pctx;
+ super(pctx);
}
public SortedMergeBucketMapjoinProc() {
}
- // Return true or false based on whether the mapjoin was converted successfully to
- // a sort-merge map join operator.
- private boolean convertSMBJoin(Node nd, Stack stack, NodeProcessorCtx procCtx,
- Object... nodeOutputs) throws SemanticException {
+ @Override
+ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
+ Object... nodeOutputs) throws SemanticException {
if (nd instanceof SMBMapJoinOperator) {
- return false;
+ return null;
}
+
MapJoinOperator mapJoinOp = (MapJoinOperator) nd;
- if (mapJoinOp.getConf().getAliasBucketFileNameMapping() == null
- || mapJoinOp.getConf().getAliasBucketFileNameMapping().size() == 0) {
- return false;
- }
+ SortBucketJoinOptProcCtx smbJoinContext = (SortBucketJoinOptProcCtx)procCtx;
- boolean tableSorted = true;
- QBJoinTree joinCxt = this.pGraphContext.getMapJoinContext()
- .get(mapJoinOp);
- if (joinCxt == null) {
- return false;
- }
- String[] srcs = joinCxt.getBaseSrc();
- int pos = 0;
-
- // All the tables/partitions columns should be sorted in the same order
- // For example, if tables A and B are being joined on columns c1, c2 and c3
- // which are the sorted and bucketed columns. The join would work, as long
- // c1, c2 and c3 are sorted in the same order.
- List sortColumnsFirstTable = new ArrayList();
-
- for (String src : srcs) {
- tableSorted = tableSorted
- && isTableSorted(this.pGraphContext,
- mapJoinOp,
- joinCxt,
- src,
- pos,
- sortColumnsFirstTable);
- pos++;
- }
- if (!tableSorted) {
- //this is a mapjoin but not suit for a sort merge bucket map join. check outer joins
- MapJoinProcessor.checkMapJoin(((MapJoinOperator) nd).getConf().getPosBigTable(),
- ((MapJoinOperator) nd).getConf().getConds());
- return false;
- }
- // convert a bucket map join operator to a sorted merge bucket map join
- // operator
- convertToSMBJoin(mapJoinOp, srcs);
- return true;
- }
-
- @Override
- public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx,
- Object... nodeOutputs) throws SemanticException {
- boolean convert = convertSMBJoin(nd, stack, procCtx, nodeOutputs);
+ boolean convert =
+ canConvertBucketMapJoinToSMBJoin(mapJoinOp, stack, smbJoinContext, nodeOutputs);
// Throw an error if the user asked for sort merge bucketed mapjoin to be enforced
// and sort merge bucketed mapjoin cannot be performed
if (!convert &&
@@ -176,160 +217,10 @@
throw new SemanticException(ErrorMsg.SORTMERGE_MAPJOIN_FAILED.getMsg());
}
+ if (convert) {
+ convertBucketMapJoinToSMBJoin(mapJoinOp, smbJoinContext);
+ }
return null;
}
-
- private SMBMapJoinOperator convertToSMBJoin(MapJoinOperator mapJoinOp,
- String[] srcs) {
- SMBMapJoinOperator smbJop = new SMBMapJoinOperator(mapJoinOp);
- SMBJoinDesc smbJoinDesc = new SMBJoinDesc(mapJoinOp.getConf());
- smbJop.setConf(smbJoinDesc);
- HashMap tagToAlias = new HashMap();
- for (int i = 0; i < srcs.length; i++) {
- tagToAlias.put((byte) i, srcs[i]);
- }
- smbJoinDesc.setTagToAlias(tagToAlias);
-
- int indexInListMapJoinNoReducer = this.pGraphContext.getListMapJoinOpsNoReducer().indexOf(mapJoinOp);
- if(indexInListMapJoinNoReducer >= 0 ) {
- this.pGraphContext.getListMapJoinOpsNoReducer().remove(indexInListMapJoinNoReducer);
- this.pGraphContext.getListMapJoinOpsNoReducer().add(indexInListMapJoinNoReducer, smbJop);
- }
-
- List extends Operator> parentOperators = mapJoinOp.getParentOperators();
- for (int i = 0; i < parentOperators.size(); i++) {
- Operator par = parentOperators.get(i);
- int index = par.getChildOperators().indexOf(mapJoinOp);
- par.getChildOperators().remove(index);
- par.getChildOperators().add(index, smbJop);
- }
- List extends Operator> childOps = mapJoinOp.getChildOperators();
- for (int i = 0; i < childOps.size(); i++) {
- Operator child = childOps.get(i);
- int index = child.getParentOperators().indexOf(mapJoinOp);
- child.getParentOperators().remove(index);
- child.getParentOperators().add(index, smbJop);
- }
- return smbJop;
- }
-
- /**
- * Whether this table is eligible for a sort-merge join.
- *
- * @param pctx parse context
- * @param op map join operator being considered
- * @param joinTree join tree being considered
- * @param alias table alias in the join tree being checked
- * @param pos position of the table
- * @param sortColumnsFirstTable The names and order of the sorted columns for the first table.
- * It is not initialized when pos = 0.
- * @return
- * @throws SemanticException
- */
- private boolean isTableSorted(ParseContext pctx,
- MapJoinOperator op,
- QBJoinTree joinTree,
- String alias,
- int pos,
- List sortColumnsFirstTable)
- throws SemanticException {
-
- Map> topOps = this.pGraphContext
- .getTopOps();
- Map topToTable = this.pGraphContext
- .getTopToTable();
- TableScanOperator tso = (TableScanOperator) topOps.get(alias);
- if (tso == null) {
- return false;
- }
-
- List keys = op.getConf().getKeys().get((byte) pos);
- // get all join columns from join keys stored in MapJoinDesc
- List joinCols = new ArrayList();
- List joinKeys = new ArrayList();
- joinKeys.addAll(keys);
- while (joinKeys.size() > 0) {
- ExprNodeDesc node = joinKeys.remove(0);
- if (node instanceof ExprNodeColumnDesc) {
- joinCols.addAll(node.getCols());
- } else if (node instanceof ExprNodeGenericFuncDesc) {
- ExprNodeGenericFuncDesc udfNode = ((ExprNodeGenericFuncDesc) node);
- GenericUDF udf = udfNode.getGenericUDF();
- if (!FunctionRegistry.isDeterministic(udf)) {
- return false;
- }
- joinKeys.addAll(0, udfNode.getChildExprs());
- }
- }
-
- Table tbl = topToTable.get(tso);
- if (tbl.isPartitioned()) {
- PrunedPartitionList prunedParts = null;
- try {
- prunedParts = pGraphContext.getOpToPartList().get(tso);
- if (prunedParts == null) {
- prunedParts = PartitionPruner.prune(tbl, pGraphContext
- .getOpToPartPruner().get(tso), pGraphContext.getConf(), alias,
- pGraphContext.getPrunedPartitions());
- pGraphContext.getOpToPartList().put(tso, prunedParts);
- }
- } catch (HiveException e) {
- LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
- throw new SemanticException(e.getMessage(), e);
- }
- List partitions = prunedParts.getNotDeniedPartns();
- // Populate the names and order of columns for the first partition of the
- // first table
- if ((pos == 0) && (partitions != null) && (!partitions.isEmpty())) {
- Partition firstPartition = partitions.get(0);
- sortColumnsFirstTable.addAll(firstPartition.getSortCols());
- }
-
- for (Partition partition : prunedParts.getNotDeniedPartns()) {
- if (!checkSortColsAndJoinCols(partition.getSortCols(),
- joinCols,
- sortColumnsFirstTable)) {
- return false;
- }
- }
- return true;
- }
-
- // Populate the names and order of columns for the first table
- if (pos == 0) {
- sortColumnsFirstTable.addAll(tbl.getSortCols());
- }
-
- return checkSortColsAndJoinCols(tbl.getSortCols(),
- joinCols,
- sortColumnsFirstTable);
- }
-
- private boolean checkSortColsAndJoinCols(List sortCols,
- List joinCols,
- List sortColumnsFirstPartition) {
-
- if (sortCols == null || sortCols.size() != joinCols.size()) {
- return false;
- }
-
- List sortColNames = new ArrayList();
-
- // The join columns should contain all the sort columns
- // The sort columns of all the tables should be in the same order
- // compare the column names and the order with the first table/partition.
- for (int pos = 0; pos < sortCols.size(); pos++) {
- Order o = sortCols.get(pos);
- if (!o.equals(sortColumnsFirstPartition.get(pos))) {
- return false;
- }
- sortColNames.add(sortColumnsFirstPartition.get(pos).getCol());
- }
-
- // The column names and order (ascending/descending) matched
- // The join columns should contain sort columns
- return sortColNames.containsAll(joinCols);
- }
}
-
}
Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/LeftSortMergeJoinBigTableMatcher.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/LeftSortMergeJoinBigTableMatcher.java (revision 0)
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/LeftSortMergeJoinBigTableMatcher.java (working copy)
@@ -0,0 +1,32 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.optimizer;
+
+import org.apache.hadoop.hive.ql.exec.JoinOperator;
+import org.apache.hadoop.hive.ql.parse.ParseContext;
+
+/*
+ * This is a pluggable policy to choose the big table for converting a join to a
+ * sort-merge join. The leftmost table is chosen as the big table.
+ */
+public class LeftSortMergeJoinBigTableMatcher implements SortMergeJoinBigTableMatcher {
+ public int getBigTablePosition(ParseContext parseContext, JoinOperator joinOp) {
+ return 0;
+ }
+}
Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java (revision 1386502)
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java (working copy)
@@ -63,12 +63,15 @@
}
transformations.add(new SamplePruner());
transformations.add(new MapJoinProcessor());
- if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTBUCKETMAPJOIN)) {
+ if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTBUCKETMAPJOIN) ||
+ HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTSORTMERGEBUCKETMAPJOIN)) {
transformations.add(new BucketMapJoinOptimizer());
- if(HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTSORTMERGEBUCKETMAPJOIN)) {
- transformations.add(new SortedMergeBucketMapJoinOptimizer());
- }
}
+
+ if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTSORTMERGEBUCKETMAPJOIN)) {
+ transformations.add(new SortedMergeBucketMapJoinOptimizer());
+ }
+
transformations.add(new UnionProcessor());
transformations.add(new JoinReorder());
if(HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTREDUCEDEDUPLICATION)) {
Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/BucketJoinOptProcCtx.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/BucketJoinOptProcCtx.java (revision 0)
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/BucketJoinOptProcCtx.java (working copy)
@@ -0,0 +1,146 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.optimizer;
+
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.ql.exec.JoinOperator;
+import org.apache.hadoop.hive.ql.exec.MapJoinOperator;
+import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
+import org.apache.hadoop.hive.ql.metadata.Partition;
+
+public class BucketJoinOptProcCtx implements NodeProcessorCtx {
+ private static final Log LOG =
+ LogFactory.getLog(BucketJoinOptProcCtx.class.getName());
+
+ private final HiveConf conf;
+
+ // we only convert map joins that follow a root table scan in the same
+ // mapper. That means there is no reducer between the root table scan and
+ // the mapjoin.
+ private Set<MapJoinOperator> listOfRejectedMapjoins = new HashSet<MapJoinOperator>();
+ private Set<JoinOperator> listOfRejectedJoins = new HashSet<JoinOperator>();
+
+ // The list of join operators which can be converted to a bucketed map join
+ private Set<JoinOperator> listOfConvertedJoins = new HashSet<JoinOperator>();
+
+ private Map<String, List<Integer>> aliasToPartitionBucketNumberMapping;
+ private Map<String, List<List<String>>> aliasToPartitionBucketFileNamesMapping;
+ private Map<Partition, List<String>> bigTblPartsToBucketFileNames;
+ private Map<Partition, Integer> bigTblPartsToBucketNumber;
+ private List<String> joinAliases;
+ private String baseBigAlias;
+ private boolean bigTablePartitioned;
+
+ public BucketJoinOptProcCtx(HiveConf conf) {
+ this.conf = conf;
+ }
+
+ public HiveConf getConf() {
+ return conf;
+ }
+
+ public Set<MapJoinOperator> getListOfRejectedMapjoins() {
+ return listOfRejectedMapjoins;
+ }
+
+ public void setListOfRejectedMapjoins(Set<MapJoinOperator> listOfRejectedMapjoins) {
+ this.listOfRejectedMapjoins = listOfRejectedMapjoins;
+ }
+
+ public Set<JoinOperator> getListOfRejectedJoins() {
+ return listOfRejectedJoins;
+ }
+
+ public Set<JoinOperator> getListOfConvertedJoins() {
+ return listOfConvertedJoins;
+ }
+
+ public void setListOfRejectedJoins(Set<JoinOperator> listOfRejectedJoins) {
+ this.listOfRejectedJoins = listOfRejectedJoins;
+ }
+
+ public void setListOfConvertedJoins(Set<JoinOperator> listOfConvertedJoins) {
+ this.listOfConvertedJoins = listOfConvertedJoins;
+ }
+
+ public Map<String, List<Integer>> getAliasToPartitionBucketNumberMapping() {
+ return aliasToPartitionBucketNumberMapping;
+ }
+
+ public Map<String, List<List<String>>> getAliasToPartitionBucketFileNamesMapping() {
+ return aliasToPartitionBucketFileNamesMapping;
+ }
+
+ public Map<Partition, List<String>> getBigTblPartsToBucketFileNames() {
+ return bigTblPartsToBucketFileNames;
+ }
+
+ public Map<Partition, Integer> getBigTblPartsToBucketNumber() {
+ return bigTblPartsToBucketNumber;
+ }
+
+ public void setAliasToPartitionBucketNumberMapping(
+ Map<String, List<Integer>> aliasToPartitionBucketNumberMapping) {
+ this.aliasToPartitionBucketNumberMapping = aliasToPartitionBucketNumberMapping;
+ }
+
+ public void setAliasToPartitionBucketFileNamesMapping(
+ Map<String, List<List<String>>> aliasToPartitionBucketFileNamesMapping) {
+ this.aliasToPartitionBucketFileNamesMapping = aliasToPartitionBucketFileNamesMapping;
+ }
+
+ public void setBigTblPartsToBucketFileNames(
+ Map<Partition, List<String>> bigTblPartsToBucketFileNames) {
+ this.bigTblPartsToBucketFileNames = bigTblPartsToBucketFileNames;
+ }
+
+ public void setBigTblPartsToBucketNumber(Map<Partition, Integer> bigTblPartsToBucketNumber) {
+ this.bigTblPartsToBucketNumber = bigTblPartsToBucketNumber;
+ }
+
+ public void setJoinAliases(List<String> joinAliases) {
+ this.joinAliases = joinAliases;
+ }
+
+ public void setBaseBigAlias(String baseBigAlias) {
+ this.baseBigAlias = baseBigAlias;
+ }
+
+ public List<String> getJoinAliases() {
+ return joinAliases;
+ }
+
+ public String getBaseBigAlias() {
+ return baseBigAlias;
+ }
+
+ public boolean isBigTablePartitioned() {
+ return bigTablePartitioned;
+ }
+
+ public void setBigTablePartitioned(boolean bigTablePartitioned) {
+ this.bigTablePartitioned = bigTablePartitioned;
+ }
+}
Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortMergeJoinBigTableMatcher.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortMergeJoinBigTableMatcher.java (revision 0)
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortMergeJoinBigTableMatcher.java (working copy)
@@ -0,0 +1,28 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.optimizer;
+
+import org.apache.hadoop.hive.ql.exec.JoinOperator;
+import org.apache.hadoop.hive.ql.parse.ParseContext;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+
+public interface SortMergeJoinBigTableMatcher {
+ public int getBigTablePosition(ParseContext parseContext, JoinOperator joinOp)
+ throws SemanticException;
+}
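Note (not part of the patch): the tests above select org.apache.hadoop.hive.ql.optimizer.SizeSortMergeJoinBigTableMatcher, whose source does not appear in this hunk. The sketch below only illustrates how a size-based policy could be written against the interface above and plugged in via hive.auto.sort.merge.join.bigtbl.matcher; the walk from the join's parents to their table scans and the use of getContentSummary() on the table's data location are illustrative assumptions, not the actual implementation.

// Hypothetical example only: a size-based big table matcher sketched against the
// SortMergeJoinBigTableMatcher interface. The real SizeSortMergeJoinBigTableMatcher
// referenced by the tests is not shown in this hunk and may be implemented differently.
package org.apache.hadoop.hive.ql.optimizer;

import java.io.IOException;
import java.util.List;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.JoinOperator;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.parse.ParseContext;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;

public class ExampleSizeBasedBigTableMatcher implements SortMergeJoinBigTableMatcher {
  public int getBigTablePosition(ParseContext parseContext, JoinOperator joinOp)
      throws SemanticException {
    int bigTablePosition = 0;
    long maxSize = -1;
    List<Operator<? extends OperatorDesc>> parents = joinOp.getParentOperators();
    for (int pos = 0; pos < parents.size(); pos++) {
      // Walk up the single-parent chain from each join input to its table scan.
      Operator<? extends OperatorDesc> op = parents.get(pos);
      while (!(op instanceof TableScanOperator)
          && op.getParentOperators() != null
          && op.getParentOperators().size() == 1) {
        op = op.getParentOperators().get(0);
      }
      if (!(op instanceof TableScanOperator)) {
        continue;
      }
      Table tbl = parseContext.getTopToTable().get(op);
      if (tbl == null) {
        continue;
      }
      try {
        // Estimate the table size from the bytes under its data location
        // (illustrative; partition pruning and table statistics are ignored here).
        Path location = new Path(tbl.getDataLocation().toString());
        FileSystem fs = location.getFileSystem(parseContext.getConf());
        long size = fs.getContentSummary(location).getLength();
        if (size > maxSize) {
          maxSize = size;
          bigTablePosition = pos;
        }
      } catch (IOException e) {
        throw new SemanticException(e.getMessage(), e);
      }
    }
    return bigTablePosition;
  }
}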
Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/BucketMapJoinOptimizer.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/BucketMapJoinOptimizer.java (revision 1386502)
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/BucketMapJoinOptimizer.java (working copy)
@@ -81,8 +81,8 @@
public ParseContext transform(ParseContext pctx) throws SemanticException {
Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
- BucketMapjoinOptProcCtx bucketMapJoinOptimizeCtx =
- new BucketMapjoinOptProcCtx(pctx.getConf());
+ BucketJoinOptProcCtx bucketMapJoinOptimizeCtx =
+ new BucketJoinOptProcCtx(pctx.getConf());
// process map joins with no reducers pattern
opRules.put(new RuleRegExp("R1",
@@ -116,11 +116,11 @@
return new NodeProcessor () {
@Override
public Object process(Node nd, Stack<Node> stack,
- NodeProcessorCtx procCtx, Object... nodeOutputs)
+ NodeProcessorCtx procCtx, Object... nodeOutputs)
throws SemanticException {
MapJoinOperator mapJoinOp = (MapJoinOperator) nd;
- BucketMapjoinOptProcCtx context = (BucketMapjoinOptProcCtx) procCtx;
- context.listOfRejectedMapjoins.add(mapJoinOp);
+ BucketJoinOptProcCtx context = (BucketJoinOptProcCtx) procCtx;
+ context.getListOfRejectedMapjoins().add(mapJoinOp);
return null;
}
};
@@ -130,6 +130,10 @@
return new BucketMapjoinOptProc(pctx);
}
+ private NodeProcessor getBucketjoinProc(ParseContext pctx) {
+ return new BucketMapjoinOptProc(pctx);
+ }
+
private NodeProcessor getDefaultProc() {
return new NodeProcessor() {
@Override
@@ -141,228 +145,22 @@
};
}
- class BucketMapjoinOptProc implements NodeProcessor {
+ class BucketMapjoinOptProc extends AbstractBucketJoinProc implements NodeProcessor {
- protected ParseContext pGraphContext;
-
public BucketMapjoinOptProc(ParseContext pGraphContext) {
- super();
- this.pGraphContext = pGraphContext;
+ super(pGraphContext);
}
- private boolean convertBucketMapJoin(Node nd, Stack stack, NodeProcessorCtx procCtx,
- Object... nodeOutputs) throws SemanticException {
- MapJoinOperator mapJoinOp = (MapJoinOperator) nd;
- BucketMapjoinOptProcCtx context = (BucketMapjoinOptProcCtx) procCtx;
- HiveConf conf = context.getConf();
-
- if(context.getListOfRejectedMapjoins().contains(mapJoinOp)) {
- return false;
- }
-
- QBJoinTree joinCxt = this.pGraphContext.getMapJoinContext().get(mapJoinOp);
- if(joinCxt == null) {
- return false;
- }
-
- List joinAliases = new ArrayList();
- String[] srcs = joinCxt.getBaseSrc();
- String[] left = joinCxt.getLeftAliases();
- List mapAlias = joinCxt.getMapAliases();
- String baseBigAlias = null;
- for(String s : left) {
- if(s != null && !joinAliases.contains(s)) {
- joinAliases.add(s);
- if(!mapAlias.contains(s)) {
- baseBigAlias = s;
- }
- }
- }
- for(String s : srcs) {
- if(s != null && !joinAliases.contains(s)) {
- joinAliases.add(s);
- if(!mapAlias.contains(s)) {
- baseBigAlias = s;
- }
- }
- }
-
- MapJoinDesc mjDesc = mapJoinOp.getConf();
- LinkedHashMap> aliasToPartitionBucketNumberMapping =
- new LinkedHashMap>();
- LinkedHashMap>> aliasToPartitionBucketFileNamesMapping =
- new LinkedHashMap>>();
-
- Map> topOps =
- this.pGraphContext.getTopOps();
- Map topToTable = this.pGraphContext.getTopToTable();
-
- // (partition to bucket file names) and (partition to bucket number) for
- // the big table;
- LinkedHashMap> bigTblPartsToBucketFileNames = new LinkedHashMap>();
- LinkedHashMap bigTblPartsToBucketNumber = new LinkedHashMap();
-
- Integer[] orders = null; // accessing order of join cols to bucket cols, should be same
- boolean bigTablePartitioned = true;
- for (int index = 0; index < joinAliases.size(); index++) {
- String alias = joinAliases.get(index);
- TableScanOperator tso = (TableScanOperator) topOps.get(alias);
- if (tso == null) {
- return false;
- }
- List keys = toColumns(mjDesc.getKeys().get((byte) index));
- if (keys == null || keys.isEmpty()) {
- return false;
- }
- if (orders == null) {
- orders = new Integer[keys.size()];
- }
-
- Table tbl = topToTable.get(tso);
- if(tbl.isPartitioned()) {
- PrunedPartitionList prunedParts;
- try {
- prunedParts = pGraphContext.getOpToPartList().get(tso);
- if (prunedParts == null) {
- prunedParts = PartitionPruner.prune(tbl, pGraphContext.getOpToPartPruner().get(tso), pGraphContext.getConf(), alias,
- pGraphContext.getPrunedPartitions());
- pGraphContext.getOpToPartList().put(tso, prunedParts);
- }
- } catch (HiveException e) {
- // Has to use full name to make sure it does not conflict with
- // org.apache.commons.lang.StringUtils
- LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
- throw new SemanticException(e.getMessage(), e);
- }
- List partitions = prunedParts.getNotDeniedPartns();
- // construct a mapping of (Partition->bucket file names) and (Partition -> bucket number)
- if (partitions.isEmpty()) {
- if (!alias.equals(baseBigAlias)) {
- aliasToPartitionBucketNumberMapping.put(alias, Arrays.asList());
- aliasToPartitionBucketFileNamesMapping.put(alias, new ArrayList>());
- }
- } else {
- List buckets = new ArrayList();
- List> files = new ArrayList>();
- for (Partition p : partitions) {
- if (!checkBucketColumns(p.getBucketCols(), keys, orders)) {
- return false;
- }
- List fileNames = getOnePartitionBucketFileNames(p.getDataLocation());
- // The number of files for the table should be same as number of buckets.
- int bucketCount = p.getBucketCount();
- if (fileNames.size() != bucketCount) {
- String msg = "The number of buckets for table " +
- tbl.getTableName() + " partition " + p.getName() + " is " +
- p.getBucketCount() + ", whereas the number of files is " + fileNames.size();
- throw new SemanticException(
- ErrorMsg.BUCKETED_TABLE_METADATA_INCORRECT.getMsg(msg));
- }
- if (alias.equals(baseBigAlias)) {
- bigTblPartsToBucketFileNames.put(p, fileNames);
- bigTblPartsToBucketNumber.put(p, bucketCount);
- } else {
- files.add(fileNames);
- buckets.add(bucketCount);
- }
- }
- if (!alias.equals(baseBigAlias)) {
- aliasToPartitionBucketNumberMapping.put(alias, buckets);
- aliasToPartitionBucketFileNamesMapping.put(alias, files);
- }
- }
- } else {
- if (!checkBucketColumns(tbl.getBucketCols(), keys, orders)) {
- return false;
- }
- List fileNames = getOnePartitionBucketFileNames(tbl.getDataLocation());
- Integer num = new Integer(tbl.getNumBuckets());
- // The number of files for the table should be same as number of buckets.
- if (fileNames.size() != num) {
- String msg = "The number of buckets for table " +
- tbl.getTableName() + " is " + tbl.getNumBuckets() +
- ", whereas the number of files is " + fileNames.size();
- throw new SemanticException(
- ErrorMsg.BUCKETED_TABLE_METADATA_INCORRECT.getMsg(msg));
- }
- if (alias.equals(baseBigAlias)) {
- bigTblPartsToBucketFileNames.put(null, fileNames);
- bigTblPartsToBucketNumber.put(null, tbl.getNumBuckets());
- bigTablePartitioned = false;
- } else {
- aliasToPartitionBucketNumberMapping.put(alias, Arrays.asList(num));
- aliasToPartitionBucketFileNamesMapping.put(alias, Arrays.asList(fileNames));
- }
- }
- }
-
- // All tables or partitions are bucketed, and their bucket number is
- // stored in 'bucketNumbers', we need to check if the number of buckets in
- // the big table can be divided by no of buckets in small tables.
- for (Integer bucketNumber : bigTblPartsToBucketNumber.values()) {
- if (!checkBucketNumberAgainstBigTable(aliasToPartitionBucketNumberMapping, bucketNumber)) {
- return false;
- }
- }
-
- MapJoinDesc desc = mapJoinOp.getConf();
-
- Map>> aliasBucketFileNameMapping =
- new LinkedHashMap>>();
-
- //sort bucket names for the big table
- for(List partBucketNames : bigTblPartsToBucketFileNames.values()) {
- Collections.sort(partBucketNames);
- }
-
- // go through all small tables and get the mapping from bucket file name
- // in the big table to bucket file names in small tables.
- for (int j = 0; j < joinAliases.size(); j++) {
- String alias = joinAliases.get(j);
- if (alias.equals(baseBigAlias)) {
- continue;
- }
- for (List names : aliasToPartitionBucketFileNamesMapping.get(alias)) {
- Collections.sort(names);
- }
- List smallTblBucketNums = aliasToPartitionBucketNumberMapping.get(alias);
- List> smallTblFilesList = aliasToPartitionBucketFileNamesMapping.get(alias);
-
- Map> mapping = new LinkedHashMap>();
- aliasBucketFileNameMapping.put(alias, mapping);
-
- // for each bucket file in big table, get the corresponding bucket file
- // name in the small table.
- //more than 1 partition in the big table, do the mapping for each partition
- Iterator>> bigTblPartToBucketNames =
- bigTblPartsToBucketFileNames.entrySet().iterator();
- Iterator> bigTblPartToBucketNum = bigTblPartsToBucketNumber
- .entrySet().iterator();
- while (bigTblPartToBucketNames.hasNext()) {
- assert bigTblPartToBucketNum.hasNext();
- int bigTblBucketNum = bigTblPartToBucketNum.next().getValue();
- List bigTblBucketNameList = bigTblPartToBucketNames.next().getValue();
- fillMapping(smallTblBucketNums, smallTblFilesList,
- mapping, bigTblBucketNum, bigTblBucketNameList, desc.getBigTableBucketNumMapping());
- }
- }
- desc.setAliasBucketFileNameMapping(aliasBucketFileNameMapping);
- desc.setBigTableAlias(baseBigAlias);
- if (bigTablePartitioned) {
- desc.setBigTablePartSpecToFileMapping(convert(bigTblPartsToBucketFileNames));
- }
-
- return true;
- }
-
-
@Override
@SuppressWarnings("unchecked")
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
Object... nodeOutputs) throws SemanticException {
+ BucketJoinOptProcCtx context = (BucketJoinOptProcCtx) procCtx;
+ MapJoinOperator mapJoinOperator = (MapJoinOperator)nd;
- boolean convert = convertBucketMapJoin(nd, stack, procCtx, nodeOutputs);
- BucketMapjoinOptProcCtx context = (BucketMapjoinOptProcCtx) procCtx;
+ // check whether the current mapjoin can be converted to a bucketed mapjoin
+ boolean convert = canConvertMapJoinToBucketMapJoin(
+ mapJoinOperator, pGraphContext, context);
HiveConf conf = context.getConf();
// Throw an error if the user asked for bucketed mapjoin to be enforced and
@@ -371,130 +169,13 @@
throw new SemanticException(ErrorMsg.BUCKET_MAPJOIN_NOT_POSSIBLE.getMsg());
}
- return null;
- }
-
- private List<String> toColumns(List<ExprNodeDesc> keys) {
- List<String> columns = new ArrayList<String>();
- for (ExprNodeDesc key : keys) {
- if (!(key instanceof ExprNodeColumnDesc)) {
- return null;
- }
- columns.add(((ExprNodeColumnDesc) key).getColumn());
+ if (convert) {
+ // convert the mapjoin to a bucketized mapjoin
+ convertMapJoinToBucketMapJoin(mapJoinOperator, context);
}
- return columns;
- }
- // convert partition to partition spec string
- private Map<String, List<String>> convert(Map<Partition, List<String>> mapping) {
- Map<String, List<String>> converted = new HashMap<String, List<String>>();
- for (Map.Entry<Partition, List<String>> entry : mapping.entrySet()) {
- converted.put(entry.getKey().getName(), entry.getValue());
- }
- return converted;
+ return null;
}
-
- // called for each partition of big table and populates mapping for each file in the partition
- private void fillMapping(
- List<Integer> smallTblBucketNums,
- List<List<String>> smallTblFilesList,
- Map<String, List<String>> mapping,
- int bigTblBucketNum, List<String> bigTblBucketNameList,
- Map<String, Integer> bucketFileNameMapping) {
-
- for (int bindex = 0; bindex < bigTblBucketNameList.size(); bindex++) {
- ArrayList<String> resultFileNames = new ArrayList<String>();
- for (int sindex = 0 ; sindex < smallTblBucketNums.size(); sindex++) {
- int smallTblBucketNum = smallTblBucketNums.get(sindex);
- List<String> smallTblFileNames = smallTblFilesList.get(sindex);
- if (bigTblBucketNum >= smallTblBucketNum) {
- // if the big table has more buckets than the current small table,
- // use "MOD" to get small table bucket names. For example, if the big
- // table has 4 buckets and the small table has 2 buckets, then the
- // mapping should be 0->0, 1->1, 2->0, 3->1.
- int toAddSmallIndex = bindex % smallTblBucketNum;
- resultFileNames.add(smallTblFileNames.get(toAddSmallIndex));
- } else {
- int jump = smallTblBucketNum / bigTblBucketNum;
- for (int i = bindex; i < smallTblFileNames.size(); i = i + jump) {
- resultFileNames.add(smallTblFileNames.get(i));
- }
- }
- }
- String inputBigTBLBucket = bigTblBucketNameList.get(bindex);
- mapping.put(inputBigTBLBucket, resultFileNames);
- bucketFileNameMapping.put(inputBigTBLBucket, bindex);
- }
- }
-
- private boolean checkBucketNumberAgainstBigTable(
- Map<String, List<Integer>> aliasToBucketNumber, int bucketNumberInPart) {
- for (List<Integer> bucketNums : aliasToBucketNumber.values()) {
- for (int nxt : bucketNums) {
- boolean ok = (nxt >= bucketNumberInPart) ? nxt % bucketNumberInPart == 0
- : bucketNumberInPart % nxt == 0;
- if (!ok) {
- return false;
- }
- }
- }
- return true;
- }
-
- private List<String> getOnePartitionBucketFileNames(URI location)
- throws SemanticException {
- List<String> fileNames = new ArrayList<String>();
- try {
- FileSystem fs = FileSystem.get(location, this.pGraphContext.getConf());
- FileStatus[] files = fs.listStatus(new Path(location.toString()));
- if (files != null) {
- for (FileStatus file : files) {
- fileNames.add(file.getPath().toString());
- }
- }
- } catch (IOException e) {
- throw new SemanticException(e);
- }
- return fileNames;
- }
-
- private boolean checkBucketColumns(List<String> bucketColumns, List<String> keys,
- Integer[] orders) {
- if (keys == null || bucketColumns == null || bucketColumns.isEmpty()) {
- return false;
- }
- for (int i = 0; i < keys.size(); i++) {
- int index = bucketColumns.indexOf(keys.get(i));
- if (orders[i] != null && orders[i] != index) {
- return false;
- }
- orders[i] = index;
- }
- // Check if the join columns contains all bucket columns.
- // If a table is bucketized on column B, but the join key is A and B,
- // it is easy to see joining on different buckets yield empty results.
- return keys.containsAll(bucketColumns);
- }
}
- class BucketMapjoinOptProcCtx implements NodeProcessorCtx {
- private final HiveConf conf;
-
- // we only convert map joins that follow a root table scan in the same
- // mapper. That means there is no reducer between the root table scan and
- // mapjoin.
- Set<MapJoinOperator> listOfRejectedMapjoins = new HashSet<MapJoinOperator>();
-
- public BucketMapjoinOptProcCtx(HiveConf conf) {
- this.conf = conf;
- }
-
- public HiveConf getConf() {
- return conf;
- }
-
- public Set<MapJoinOperator> getListOfRejectedMapjoins() {
- return listOfRejectedMapjoins;
- }
- }
}
Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/AbstractBucketJoinProc.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/AbstractBucketJoinProc.java (revision 0)
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/AbstractBucketJoinProc.java (working copy)
@@ -0,0 +1,442 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.optimizer;
+
+import java.io.IOException;
+import java.net.URI;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Stack;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.ErrorMsg;
+import org.apache.hadoop.hive.ql.exec.MapJoinOperator;
+import org.apache.hadoop.hive.ql.exec.Operator;
+import org.apache.hadoop.hive.ql.exec.TableScanOperator;
+import org.apache.hadoop.hive.ql.lib.Node;
+import org.apache.hadoop.hive.ql.lib.NodeProcessor;
+import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.metadata.Partition;
+import org.apache.hadoop.hive.ql.metadata.Table;
+import org.apache.hadoop.hive.ql.optimizer.ppr.PartitionPruner;
+import org.apache.hadoop.hive.ql.parse.ParseContext;
+import org.apache.hadoop.hive.ql.parse.PrunedPartitionList;
+import org.apache.hadoop.hive.ql.parse.QBJoinTree;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
+import org.apache.hadoop.hive.ql.plan.MapJoinDesc;
+import org.apache.hadoop.hive.ql.plan.OperatorDesc;
+
+/**
+ * Base class for the processors that perform the bucket map join optimization.
+ */
+abstract public class AbstractBucketJoinProc implements NodeProcessor {
+ private static final Log LOG =
+ LogFactory.getLog(AbstractBucketJoinProc.class.getName());
+
+ protected ParseContext pGraphContext;
+
+ public AbstractBucketJoinProc(ParseContext pGraphContext) {
+ this.pGraphContext = pGraphContext;
+ }
+
+ public AbstractBucketJoinProc() {
+ }
+
+ @Override
+ abstract public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
+ Object... nodeOutputs) throws SemanticException;
+
+ private static List<String> getOnePartitionBucketFileNames(
+ URI location, ParseContext pGraphContext) throws SemanticException {
+ List<String> fileNames = new ArrayList<String>();
+ try {
+ FileSystem fs = FileSystem.get(location, pGraphContext.getConf());
+ FileStatus[] files = fs.listStatus(new Path(location.toString()));
+ if (files != null) {
+ for (FileStatus file : files) {
+ fileNames.add(file.getPath().toString());
+ }
+ }
+ } catch (IOException e) {
+ throw new SemanticException(e);
+ }
+ return fileNames;
+ }
+
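+ // Check that the join keys cover all of the bucket columns and that each join
+ // key maps to the same bucket-column position in every table ('orders' records
+ // the positions seen for the first table and must match for the rest).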
+ private boolean checkBucketColumns(List<String> bucketColumns,
+ List<String> keys,
+ Integer[] orders) {
+ if (keys == null || bucketColumns == null || bucketColumns.isEmpty()) {
+ return false;
+ }
+ for (int i = 0; i < keys.size(); i++) {
+ int index = bucketColumns.indexOf(keys.get(i));
+ if (orders[i] != null && orders[i] != index) {
+ return false;
+ }
+ orders[i] = index;
+ }
+
+ // Check if the join columns contain all of the bucket columns.
+ // If a table is bucketized on column B, but the join key is A and B,
+ // it is easy to see that joining on different buckets yields empty results.
+ return keys.containsAll(bucketColumns);
+ }
+
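+ // The bucket count of every small table (partition) must evenly divide, or be
+ // evenly divisible by, the bucket count of each big table partition; for
+ // example, 2 and 4 buckets are compatible, 3 and 4 are not.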
+ private boolean checkBucketNumberAgainstBigTable(
+ Map<String, List<Integer>> aliasToBucketNumber, int bucketNumberInPart) {
+ for (List<Integer> bucketNums : aliasToBucketNumber.values()) {
+ for (int nxt : bucketNums) {
+ boolean ok = (nxt >= bucketNumberInPart) ? nxt % bucketNumberInPart == 0
+ : bucketNumberInPart % nxt == 0;
+ if (!ok) {
+ return false;
+ }
+ }
+ }
+ return true;
+ }
+
+ protected boolean canConvertMapJoinToBucketMapJoin(
+ MapJoinOperator mapJoinOp,
+ ParseContext pGraphContext,
+ BucketJoinOptProcCtx context) throws SemanticException {
+
+ if (context.getListOfRejectedMapjoins().contains(mapJoinOp)) {
+ return false;
+ }
+
+ QBJoinTree joinCtx = this.pGraphContext.getMapJoinContext().get(mapJoinOp);
+ if (joinCtx == null) {
+ return false;
+ }
+
+ List<String> joinAliases = new ArrayList<String>();
+ String[] srcs = joinCtx.getBaseSrc();
+ String[] left = joinCtx.getLeftAliases();
+ List<String> mapAlias = joinCtx.getMapAliases();
+ String baseBigAlias = null;
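+ // collect the join aliases in order; the one alias that is not a map-side
+ // (small table) alias becomes the big table alias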
+ for (String s : left) {
+ if (s != null && !joinAliases.contains(s)) {
+ joinAliases.add(s);
+ if (!mapAlias.contains(s)) {
+ baseBigAlias = s;
+ }
+ }
+ }
+
+ for (String s : srcs) {
+ if (s != null && !joinAliases.contains(s)) {
+ joinAliases.add(s);
+ if (!mapAlias.contains(s)) {
+ baseBigAlias = s;
+ }
+ }
+ }
+
+ Map<Byte, List<ExprNodeDesc>> keysMap = mapJoinOp.getConf().getKeys();
+
+ return checkConvertBucketMapJoin(
+ pGraphContext,
+ context,
+ joinCtx,
+ keysMap,
+ baseBigAlias,
+ joinAliases);
+ }
+
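+ /*
+ * Check that every table (or partition) in the join is bucketed on the join keys,
+ * that the number of files matches the number of buckets, and that the bucket
+ * counts are compatible with the big table. The bucket and file information that
+ * is gathered is stored in the context for use by convertMapJoinToBucketMapJoin.
+ */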
+ protected boolean checkConvertBucketMapJoin(
+ ParseContext pGraphContext,
+ BucketJoinOptProcCtx context,
+ QBJoinTree joinCtx,
+ Map<Byte, List<ExprNodeDesc>> keysMap,
+ String baseBigAlias,
+ List<String> joinAliases) throws SemanticException {
+
+ Map<String, List<Integer>> aliasToPartitionBucketNumberMapping =
+ new LinkedHashMap<String, List<Integer>>();
+ Map<String, List<List<String>>> aliasToPartitionBucketFileNamesMapping =
+ new LinkedHashMap<String, List<List<String>>>();
+
+ HashMap<String, Operator<? extends OperatorDesc>> topOps = pGraphContext.getTopOps();
+ Map<TableScanOperator, Table> topToTable = pGraphContext.getTopToTable();
+
+ // (partition to bucket file names) and (partition to bucket number) for
+ // the big table;
+ Map<Partition, List<String>> bigTblPartsToBucketFileNames =
+ new LinkedHashMap<Partition, List<String>>();
+ Map<Partition, Integer> bigTblPartsToBucketNumber =
+ new LinkedHashMap<Partition, Integer>();
+
+ Integer[] orders = null; // positions of the join keys within the bucket columns; must be the same for all tables
+ boolean bigTablePartitioned = true;
+ for (int index = 0; index < joinAliases.size(); index++) {
+ String alias = joinAliases.get(index);
+ TableScanOperator tso = (TableScanOperator) topOps.get(alias);
+ if (tso == null) {
+ return false;
+ }
+ Table tbl = topToTable.get(tso);
+
+ List<String> keys = toColumns(keysMap.get((byte) index));
+ if (keys == null || keys.isEmpty()) {
+ return false;
+ }
+ if (orders == null) {
+ orders = new Integer[keys.size()];
+ }
+
+ if (tbl.isPartitioned()) {
+ PrunedPartitionList prunedParts;
+ try {
+ prunedParts = pGraphContext.getOpToPartList().get(tso);
+ if (prunedParts == null) {
+ prunedParts =
+ PartitionPruner.prune(tbl, pGraphContext.getOpToPartPruner().get(tso),
+ pGraphContext.getConf(), alias,
+ pGraphContext.getPrunedPartitions());
+ pGraphContext.getOpToPartList().put(tso, prunedParts);
+ }
+ } catch (HiveException e) {
+ // Has to use full name to make sure it does not conflict with
+ // org.apache.commons.lang.StringUtils
+ LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+ throw new SemanticException(e.getMessage(), e);
+ }
+ List<Partition> partitions = prunedParts.getNotDeniedPartns();
+ // construct a mapping of (Partition->bucket file names) and (Partition -> bucket number)
+ if (partitions.isEmpty()) {
+ if (!alias.equals(baseBigAlias)) {
+ aliasToPartitionBucketNumberMapping.put(alias, Arrays.<Integer>asList());
+ aliasToPartitionBucketFileNamesMapping.put(alias, new ArrayList<List<String>>());
+ }
+ } else {
+ List<Integer> buckets = new ArrayList<Integer>();
+ List<List<String>> files = new ArrayList<List<String>>();
+ for (Partition p : partitions) {
+ if (!checkBucketColumns(p.getBucketCols(), keys, orders)) {
+ return false;
+ }
+ List<String> fileNames =
+ getOnePartitionBucketFileNames(p.getDataLocation(), pGraphContext);
+ // The number of files for the partition should be the same as the number of buckets.
+ int bucketCount = p.getBucketCount();
+
+ if (fileNames.size() != bucketCount) {
+ String msg = "The number of buckets for table " +
+ tbl.getTableName() + " partition " + p.getName() + " is " +
+ p.getBucketCount() + ", whereas the number of files is " + fileNames.size();
+ throw new SemanticException(
+ ErrorMsg.BUCKETED_TABLE_METADATA_INCORRECT.getMsg(msg));
+ }
+
+ if (alias.equals(baseBigAlias)) {
+ bigTblPartsToBucketFileNames.put(p, fileNames);
+ bigTblPartsToBucketNumber.put(p, bucketCount);
+ } else {
+ files.add(fileNames);
+ buckets.add(bucketCount);
+ }
+ }
+ if (!alias.equals(baseBigAlias)) {
+ aliasToPartitionBucketNumberMapping.put(alias, buckets);
+ aliasToPartitionBucketFileNamesMapping.put(alias, files);
+ }
+ }
+ } else {
+ if (!checkBucketColumns(tbl.getBucketCols(), keys, orders)) {
+ return false;
+ }
+ List<String> fileNames =
+ getOnePartitionBucketFileNames(tbl.getDataLocation(), pGraphContext);
+ Integer num = new Integer(tbl.getNumBuckets());
+
+ // The number of files for the table should be the same as the number of buckets.
+ if (fileNames.size() != num) {
+ String msg = "The number of buckets for table " +
+ tbl.getTableName() + " is " + tbl.getNumBuckets() +
+ ", whereas the number of files is " + fileNames.size();
+ throw new SemanticException(
+ ErrorMsg.BUCKETED_TABLE_METADATA_INCORRECT.getMsg(msg));
+ }
+
+ if (alias.equals(baseBigAlias)) {
+ bigTblPartsToBucketFileNames.put(null, fileNames);
+ bigTblPartsToBucketNumber.put(null, tbl.getNumBuckets());
+ bigTablePartitioned = false;
+ } else {
+ aliasToPartitionBucketNumberMapping.put(alias, Arrays.asList(num));
+ aliasToPartitionBucketFileNamesMapping.put(alias, Arrays.asList(fileNames));
+ }
+ }
+ }
+
+ // All tables or partitions are bucketed and their bucket numbers have been
+ // collected above; we need to check that the number of buckets in the big
+ // table is compatible with the number of buckets in the small tables.
+ for (Integer bucketNumber : bigTblPartsToBucketNumber.values()) {
+ if (!checkBucketNumberAgainstBigTable(aliasToPartitionBucketNumberMapping, bucketNumber)) {
+ return false;
+ }
+ }
+
+ context.setAliasToPartitionBucketNumberMapping(aliasToPartitionBucketNumberMapping);
+ context.setAliasToPartitionBucketFileNamesMapping(aliasToPartitionBucketFileNamesMapping);
+ context.setBigTblPartsToBucketFileNames(bigTblPartsToBucketFileNames);
+ context.setBigTblPartsToBucketNumber(bigTblPartsToBucketNumber);
+ context.setJoinAliases(joinAliases);
+ context.setBaseBigAlias(baseBigAlias);
+ context.setBigTablePartitioned(bigTablePartitioned);
+
+ return true;
+ }
+
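+ /*
+ * Rewrite the MapJoinDesc using the bucket and file information gathered during
+ * the check phase: each bucket file of the big table is mapped to the matching
+ * bucket files of every small table.
+ */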
+ protected void convertMapJoinToBucketMapJoin(
+ MapJoinOperator mapJoinOp,
+ BucketJoinOptProcCtx context) throws SemanticException {
+ MapJoinDesc desc = mapJoinOp.getConf();
+
+ Map<String, Map<String, List<String>>> aliasBucketFileNameMapping =
+ new LinkedHashMap<String, Map<String, List<String>>>();
+
+ Map<String, List<Integer>> aliasToPartitionBucketNumberMapping =
+ context.getAliasToPartitionBucketNumberMapping();
+
+ Map<String, List<List<String>>> aliasToPartitionBucketFileNamesMapping =
+ context.getAliasToPartitionBucketFileNamesMapping();
+
+ Map<Partition, List<String>> bigTblPartsToBucketFileNames =
+ context.getBigTblPartsToBucketFileNames();
+
+ Map<Partition, Integer> bigTblPartsToBucketNumber =
+ context.getBigTblPartsToBucketNumber();
+
+ List<String> joinAliases = context.getJoinAliases();
+ String baseBigAlias = context.getBaseBigAlias();
+
+ // sort bucket names for the big table
+ for (List<String> partBucketNames : bigTblPartsToBucketFileNames.values()) {
+ Collections.sort(partBucketNames);
+ }
+
+ // go through all small tables and get the mapping from bucket file name
+ // in the big table to bucket file names in small tables.
+ for (int j = 0; j < joinAliases.size(); j++) {
+ String alias = joinAliases.get(j);
+ if (alias.equals(baseBigAlias)) {
+ continue;
+ }
+ for (List<String> names : aliasToPartitionBucketFileNamesMapping.get(alias)) {
+ Collections.sort(names);
+ }
+ List<Integer> smallTblBucketNums = aliasToPartitionBucketNumberMapping.get(alias);
+ List<List<String>> smallTblFilesList = aliasToPartitionBucketFileNamesMapping.get(alias);
+
+ Map<String, List<String>> mapping = new LinkedHashMap<String, List<String>>();
+ aliasBucketFileNameMapping.put(alias, mapping);
+
+ // for each bucket file in the big table, get the corresponding bucket file
+ // names in the small tables. If there is more than 1 partition in the big
+ // table, do the mapping for each partition separately.
+ Iterator<Entry<Partition, List<String>>> bigTblPartToBucketNames =
+ bigTblPartsToBucketFileNames.entrySet().iterator();
+ Iterator<Entry<Partition, Integer>> bigTblPartToBucketNum = bigTblPartsToBucketNumber
+ .entrySet().iterator();
+ while (bigTblPartToBucketNames.hasNext()) {
+ assert bigTblPartToBucketNum.hasNext();
+ int bigTblBucketNum = bigTblPartToBucketNum.next().getValue();
+ List<String> bigTblBucketNameList = bigTblPartToBucketNames.next().getValue();
+ fillMapping(smallTblBucketNums, smallTblFilesList,
+ mapping, bigTblBucketNum, bigTblBucketNameList, desc.getBigTableBucketNumMapping());
+ }
+ }
+ desc.setAliasBucketFileNameMapping(aliasBucketFileNameMapping);
+ desc.setBigTableAlias(baseBigAlias);
+ boolean bigTablePartitioned = context.isBigTablePartitioned();
+ if (bigTablePartitioned) {
+ desc.setBigTablePartSpecToFileMapping(convert(bigTblPartsToBucketFileNames));
+ }
+ }
+
+ // convert partition to partition spec string
+ private static Map<String, List<String>> convert(Map<Partition, List<String>> mapping) {
+ Map<String, List<String>> converted = new HashMap<String, List<String>>();
+ for (Map.Entry<Partition, List<String>> entry : mapping.entrySet()) {
+ converted.put(entry.getKey().getName(), entry.getValue());
+ }
+ return converted;
+ }
+
+ private List<String> toColumns(List<ExprNodeDesc> keys) {
+ List<String> columns = new ArrayList<String>();
+ for (ExprNodeDesc key : keys) {
+ if (!(key instanceof ExprNodeColumnDesc)) {
+ return null;
+ }
+ columns.add(((ExprNodeColumnDesc) key).getColumn());
+ }
+ return columns;
+ }
+
+ // called for each partition of the big table; populates the mapping for each file in the partition
+ private static void fillMapping(
+ List<Integer> smallTblBucketNums,
+ List<List<String>> smallTblFilesList,
+ Map<String, List<String>> mapping,
+ int bigTblBucketNum, List<String> bigTblBucketNameList,
+ Map<String, Integer> bucketFileNameMapping) {
+
+ for (int bindex = 0; bindex < bigTblBucketNameList.size(); bindex++) {
+ ArrayList<String> resultFileNames = new ArrayList<String>();
+ for (int sindex = 0 ; sindex < smallTblBucketNums.size(); sindex++) {
+ int smallTblBucketNum = smallTblBucketNums.get(sindex);
+ List<String> smallTblFileNames = smallTblFilesList.get(sindex);
+ if (bigTblBucketNum >= smallTblBucketNum) {
+ // if the big table has more buckets than the current small table,
+ // use "MOD" to get small table bucket names. For example, if the big
+ // table has 4 buckets and the small table has 2 buckets, then the
+ // mapping should be 0->0, 1->1, 2->0, 3->1.
+ int toAddSmallIndex = bindex % smallTblBucketNum;
+ resultFileNames.add(smallTblFileNames.get(toAddSmallIndex));
+ } else {
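+ // the small table has more buckets than the big table; since the bucket
+ // counts divide evenly, every "jump"-th small bucket belongs to this big
+ // bucket. For example, with a 2 bucket big table and a 4 bucket small
+ // table, the mapping is 0->{0,2}, 1->{1,3}.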
+ int jump = smallTblBucketNum / bigTblBucketNum;
+ for (int i = bindex; i < smallTblFileNames.size(); i = i + jump) {
+ resultFileNames.add(smallTblFileNames.get(i));
+ }
+ }
+ }
+ String inputBigTBLBucket = bigTblBucketNameList.get(bindex);
+ mapping.put(inputBigTBLBucket, resultFileNames);
+ bucketFileNameMapping.put(inputBigTBLBucket, bindex);
+ }
+ }
+}
Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/SizeSortMergeJoinBigTableMatcher.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/SizeSortMergeJoinBigTableMatcher.java (revision 0)
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/SizeSortMergeJoinBigTableMatcher.java (working copy)
@@ -0,0 +1,132 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.optimizer;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.ql.exec.JoinOperator;
+import org.apache.hadoop.hive.ql.exec.Operator;
+import org.apache.hadoop.hive.ql.exec.TableScanOperator;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.metadata.Partition;
+import org.apache.hadoop.hive.ql.metadata.Table;
+import org.apache.hadoop.hive.ql.optimizer.ppr.PartitionPruner;
+import org.apache.hadoop.hive.ql.parse.ParseContext;
+import org.apache.hadoop.hive.ql.parse.PrunedPartitionList;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.plan.OperatorDesc;
+
+/*
+ * This is a pluggable policy to choose the big table when a join is automatically converted
+ * to a sort-merge join. The table with the largest total size is chosen as the big table.
+ */
+public class SizeSortMergeJoinBigTableMatcher implements SortMergeJoinBigTableMatcher {
+ private void getListTopOps(
+ Operator<? extends OperatorDesc> op, List<TableScanOperator> topOps) {
+ if ((op.getParentOperators() == null) ||
+ (op.getParentOperators().isEmpty())) {
+ return;
+ }
+
+ for (Operator<? extends OperatorDesc> parentOp : op.getParentOperators()) {
+ if (parentOp instanceof TableScanOperator) {
+ topOps.add((TableScanOperator)parentOp);
+ }
+ else {
+ getListTopOps(parentOp, topOps);
+ }
+ }
+ }
+
+ private long getSize(HiveConf conf, String size, Path path) {
+ // If the size is present in the metastore, use it
+ if (size != null) {
+ try {
+ return Long.valueOf(size);
+ } catch (NumberFormatException e) {
+ return 0;
+ }
+ }
+
+ try {
+ FileSystem fs = path.getFileSystem(conf);
+ return fs.getContentSummary(path).getLength();
+ } catch (Exception e) {
+ return 0;
+ }
+ }
+
+ private long getSize(HiveConf conf, Table table) {
+ Path path = table.getPath();
+ String size = table.getProperty("totalSize");
+ return getSize(conf, size, path);
+ }
+
+ private long getSize(HiveConf conf, Partition partition) {
+ Path path = partition.getPartitionPath();
+ String size = partition.getParameters().get("totalSize");
+
+ return getSize(conf, size, path);
+ }
+
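+ // Walk up from the join to collect the table scans feeding it, compute the total
+ // size of each table (summing the pruned partitions for partitioned tables), and
+ // return the position of the largest one.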
+ public int getBigTablePosition(ParseContext parseCtx, JoinOperator joinOp)
+ throws SemanticException {
+ int bigTablePos = 0;
+ long maxSize = 0;
+ HiveConf conf = parseCtx.getConf();
+
+ try {
+ List<TableScanOperator> topOps = new ArrayList<TableScanOperator>();
+ getListTopOps(joinOp, topOps);
+ int currentPos = 0;
+ for (TableScanOperator topOp : topOps) {
+ Table table = parseCtx.getTopToTable().get(topOp);
+ long currentSize = 0;
+
+ if (!table.isPartitioned()) {
+ currentSize = getSize(conf, table);
+ }
+ else {
+ // For partitioned tables, get the size of all the partitions
+ PrunedPartitionList partsList =
+ PartitionPruner.prune(parseCtx.getTopToTable().get(topOp),
+ parseCtx.getOpToPartPruner().get(topOp), parseCtx.getConf(),
+ null, parseCtx.getPrunedPartitions());
+ for (Partition part : partsList.getNotDeniedPartns()) {
+ currentSize += getSize(conf, part);
+ }
+ }
+
+ if (currentSize > maxSize) {
+ maxSize = currentSize;
+ bigTablePos = currentPos;
+ }
+ currentPos++;
+ }
+ } catch (HiveException e) {
+ throw new SemanticException(e.getMessage());
+ }
+
+ return bigTablePos;
+ }
+}
Index: ql/src/java/org/apache/hadoop/hive/ql/exec/FilterOperator.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/exec/FilterOperator.java (revision 1386502)
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/FilterOperator.java (working copy)
@@ -160,4 +160,9 @@
public OperatorType getType() {
return OperatorType.FILTER;
}
+
+ @Override
+ public boolean supportAutomaticSortMergeJoin() {
+ return true;
+ }
}
Index: ql/src/java/org/apache/hadoop/hive/ql/exec/SelectOperator.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/exec/SelectOperator.java (revision 1386502)
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/SelectOperator.java (working copy)
@@ -100,4 +100,9 @@
public OperatorType getType() {
return OperatorType.SELECT;
}
+
+ @Override
+ public boolean supportAutomaticSortMergeJoin() {
+ return true;
+ }
}
Index: ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java (revision 1386502)
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java (working copy)
@@ -278,4 +278,9 @@
}
}
}
+
+ @Override
+ public boolean supportAutomaticSortMergeJoin() {
+ return true;
+ }
}
Index: ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java (revision 1386502)
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java (working copy)
@@ -1359,4 +1359,13 @@
return ret;
}
+
+ /**
+ * Whether this operator supports automatic sort merge join.
+ * The stack is traversed, and this method is invoked for all the operators.
+ * @return TRUE if yes, FALSE otherwise.
+ */
+ public boolean supportAutomaticSortMergeJoin() {
+ return false;
+ }
}