Index: conf/hive-default.xml.template
===================================================================
--- conf/hive-default.xml.template (revision 1432851)
+++ conf/hive-default.xml.template (working copy)
@@ -916,6 +916,24 @@
+<property>
+  <name>hive.auto.sort.merge.join</name>
+  <value>false</value>
+  <description>Whether the join will be automatically converted to a sort-merge join if the joined tables pass
+  the criteria for sort-merge join.
+  </description>
+</property>
+
+<property>
+  <name>hive.auto.sort.merge.join.bigtbl.matcher</name>
+  <value>org.apache.hadoop.hive.ql.optimizer.LeftSortMergeJoinBigTableMatcher</value>
+  <description>The policy used to choose the big table for automatic conversion to sort-merge join.
+  By default, the leftmost table is chosen as the big table. A size-based policy is also available:
+  org.apache.hadoop.hive.ql.optimizer.SizeSortMergeJoinBigTableMatcher.
+  New policies can be added in the future.
+  </description>
+</property>
+
 <property>
   <name>hive.metastore.ds.connection.url.hook</name>
   <value></value>
   <description>Name of the hook to use for retrieving the JDO connection URL. If empty, the value in javax.jdo.option.ConnectionURL is used</description>
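
The two new properties can also be set programmatically. A minimal sketch, assuming only the ConfVars entries added by this patch (see the HiveConf.java hunk below); the class and main method are illustrative:

    import org.apache.hadoop.hive.conf.HiveConf;

    public class AutoSMBJoinConfExample {
      public static void main(String[] args) {
        HiveConf conf = new HiveConf();
        // Turn on automatic conversion of qualifying joins to sort-merge joins.
        conf.setBoolVar(HiveConf.ConfVars.HIVE_AUTO_SORT_MERGE_JOIN, true);
        // Swap the default leftmost-table policy for the size-based one.
        conf.setVar(HiveConf.ConfVars.HIVE_AUTO_SORT_MERGE_JOIN_BIGTABLE_MATCHER,
            "org.apache.hadoop.hive.ql.optimizer.SizeSortMergeJoinBigTableMatcher");
        System.out.println(conf.getBoolVar(HiveConf.ConfVars.HIVE_AUTO_SORT_MERGE_JOIN));
      }
    }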
Index: build.properties
===================================================================
--- build.properties (revision 1432851)
+++ build.properties (working copy)
@@ -79,7 +79,7 @@
# (measured in milliseconds). Ignored if fork is disabled. When running
# multiple tests inside the same Java VM (see forkMode), timeout
# applies to the time that all tests use together, not to an individual test.
-test.junit.timeout=43200000
+test.junit.timeout=432000000
# Use this property to selectively disable tests from the command line:
# ant test -Dtest.junit.exclude="**/TestCliDriver.class"
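
For scale: the old limit of 43200000 ms works out to 12 hours (43200000 / 1000 / 3600 = 12), while the new 432000000 ms is 120 hours, i.e. 5 days, shared by all tests running in one fork.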
Index: data/files/smallsrcsortbucket4outof4.txt
===================================================================
--- data/files/smallsrcsortbucket4outof4.txt (revision 0)
+++ data/files/smallsrcsortbucket4outof4.txt (working copy)
@@ -0,0 +1,5 @@
+146val_146
+193val_193
+432val_432
+65val_65
+83val_83
Index: data/files/smallsrcsortbucket1outof4.txt
===================================================================
--- data/files/smallsrcsortbucket1outof4.txt (revision 0)
+++ data/files/smallsrcsortbucket1outof4.txt (working copy)
@@ -0,0 +1,5 @@
+0val_0
+103val_103
+169val_169
+172val_172
+374val_374
Index: data/files/smallsrcsortbucket2outof4.txt
===================================================================
--- data/files/smallsrcsortbucket2outof4.txt (revision 0)
+++ data/files/smallsrcsortbucket2outof4.txt (working copy)
@@ -0,0 +1,5 @@
+180val_180
+221val_221
+379val_379
+478val_478
+74val_74
Index: data/files/smallsrcsortbucket3outof4.txt
===================================================================
--- data/files/smallsrcsortbucket3outof4.txt (revision 0)
+++ data/files/smallsrcsortbucket3outof4.txt (working copy)
@@ -0,0 +1,5 @@
+233val_233
+424val_424
+468val_468
+53val_53
+97val_97
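
The four small-table files above are the bucketed, sorted output of the source data. As a toy illustration of how rows are routed to bucket files, the sketch below uses the usual bucket = (hash(key) & Integer.MAX_VALUE) % numBuckets rule; Java's String.hashCode stands in for Hive's actual hash (which is computed over the serialized bytes and differs), so the exact assignments are schematic:

    public class BucketAssignmentSketch {
      static int bucketFor(String key, int numBuckets) {
        return (key.hashCode() & Integer.MAX_VALUE) % numBuckets;
      }

      public static void main(String[] args) {
        // A 4-bucket small table lines up with a 2-bucket big table because
        // 4 is a multiple of 2: hash % 2 == (hash % 4) % 2, so small buckets
        // 0 and 2 pair with big bucket 0, and small buckets 1 and 3 with big bucket 1.
        for (String k : new String[] {"0", "103", "146", "233", "374"}) {
          System.out.printf("key=%s -> small bucket %d, big bucket %d%n",
              k, bucketFor(k, 4), bucketFor(k, 2));
        }
      }
    }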
Index: common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
===================================================================
--- common/src/java/org/apache/hadoop/hive/conf/HiveConf.java (revision 1432851)
+++ common/src/java/org/apache/hadoop/hive/conf/HiveConf.java (working copy)
@@ -498,6 +498,10 @@
HIVEENFORCESORTMERGEBUCKETMAPJOIN("hive.enforce.sortmergebucketmapjoin", false),
HIVEENFORCEBUCKETMAPJOIN("hive.enforce.bucketmapjoin", false),
+ HIVE_AUTO_SORT_MERGE_JOIN("hive.auto.sort.merge.join", false),
+ HIVE_AUTO_SORT_MERGE_JOIN_BIGTABLE_MATCHER("hive.auto.sort.merge.join.bigtbl.matcher",
+ "org.apache.hadoop.hive.ql.optimizer.LeftSortMergeJoinBigTableMatcher"),
+
HIVESCRIPTOPERATORTRUST("hive.exec.script.trust", false),
HIVEROWOFFSET("hive.exec.rowoffset", false),
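
The matcher class name is consumed by the optimizer; the interface of the matcher classes is not part of this hunk, so the sketch below loads the configured policy generically, using only APIs known to exist (the HiveConf getters, Class.forName, and Hadoop's ReflectionUtils):

    import org.apache.hadoop.hive.conf.HiveConf;
    import org.apache.hadoop.util.ReflectionUtils;

    public class BigTableMatcherLoader {
      // Returns an instance of the configured big-table policy, or null when
      // automatic sort-merge join conversion is disabled.
      public static Object loadMatcher(HiveConf conf) throws ClassNotFoundException {
        if (!conf.getBoolVar(HiveConf.ConfVars.HIVE_AUTO_SORT_MERGE_JOIN)) {
          return null;
        }
        String className =
            conf.getVar(HiveConf.ConfVars.HIVE_AUTO_SORT_MERGE_JOIN_BIGTABLE_MATCHER);
        // ReflectionUtils also wires the Configuration into Configurable instances.
        return ReflectionUtils.newInstance(Class.forName(className), conf);
      }
    }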
Index: ql/src/test/results/clientpositive/auto_sortmerge_join_6.q.out
===================================================================
--- ql/src/test/results/clientpositive/auto_sortmerge_join_6.q.out (revision 0)
+++ ql/src/test/results/clientpositive/auto_sortmerge_join_6.q.out (working copy)
@@ -0,0 +1,761 @@
+PREHOOK: query: -- small no part, 4 bucket & big 2 part, 2 bucket
+CREATE TABLE bucket_small (key string, value string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: -- small no part, 4 bucket & big 2 part, 2 bucket
+CREATE TABLE bucket_small (key string, value string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@bucket_small
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small
+PREHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@bucket_big
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big
+POSTHOOK: Output: default@bucket_big@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big@ds=2008-04-08
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big
+POSTHOOK: Output: default@bucket_big@ds=2008-04-09
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big@ds=2008-04-09
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big@ds=2008-04-09
+PREHOOK: query: -- Since size is being used to find the big table, the order of the tables in the join does not matter
+explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- Since size is being used to find the big table, the order of the tables in the join does not matter
+explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME bucket_small) a) (TOK_TABREF (TOK_TABNAME bucket_big) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count)))))
+
+STAGE DEPENDENCIES:
+ Stage-6 is a root stage , consists of Stage-7, Stage-8, Stage-1
+ Stage-7 has a backup stage: Stage-1
+ Stage-4 depends on stages: Stage-7
+ Stage-2 depends on stages: Stage-1, Stage-4, Stage-5
+ Stage-8 has a backup stage: Stage-1
+ Stage-5 depends on stages: Stage-8
+ Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-6
+ Conditional Operator
+
+ Stage: Stage-7
+ Map Reduce Local Work
+ Alias -> Map Local Tables:
+ b
+ Fetch Operator
+ limit: -1
+ Alias -> Map Local Operator Tree:
+ b
+ TableScan
+ alias: b
+ GatherStats: false
+ HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 0
+
+ Stage: Stage-4
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a
+ TableScan
+ alias: a
+ GatherStats: false
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Local Work:
+ Map Reduce Local Work
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-08
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 2
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 2750
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5500
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-09
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-09
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 2
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 2750
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5500
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+#### A masked pattern was here ####
+ Partition
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_small
+ numFiles 4
+ numPartitions 0
+ numRows 0
+ rawDataSize 0
+ serialization.ddl struct bucket_small { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 226
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_small
+ numFiles 4
+ numPartitions 0
+ numRows 0
+ rawDataSize 0
+ serialization.ddl struct bucket_small { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 226
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_small
+ name: default.bucket_small
+ Truncated Path -> Alias:
+ /bucket_small [a]
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: -mr-10002
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0
+ columns.types bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Truncated Path -> Alias:
+#### A masked pattern was here ####
+
+ Stage: Stage-8
+ Map Reduce Local Work
+ Alias -> Map Local Tables:
+ a
+ Fetch Operator
+ limit: -1
+ Alias -> Map Local Operator Tree:
+ a
+ TableScan
+ alias: a
+ GatherStats: false
+ HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 1
+
+ Stage: Stage-5
+ Map Reduce
+ Alias -> Map Operator Tree:
+ b
+ TableScan
+ alias: b
+ GatherStats: false
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Local Work:
+ Map Reduce Local Work
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-08
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 2
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 2750
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5500
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-09
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-09
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 2
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 2750
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5500
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+#### A masked pattern was here ####
+ Partition
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_small
+ numFiles 4
+ numPartitions 0
+ numRows 0
+ rawDataSize 0
+ serialization.ddl struct bucket_small { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 226
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_small
+ numFiles 4
+ numPartitions 0
+ numRows 0
+ rawDataSize 0
+ serialization.ddl struct bucket_small { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 226
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_small
+ name: default.bucket_small
+ Truncated Path -> Alias:
+ /bucket_big/ds=2008-04-08 [b]
+ /bucket_big/ds=2008-04-09 [b]
+
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ b
+ TableScan
+ alias: b
+ GatherStats: false
+ Sorted Merge Bucket Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-08
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 2
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 2750
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5500
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-09
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-09
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 2
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 2750
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5500
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+ Truncated Path -> Alias:
+ /bucket_big/ds=2008-04-08 [b]
+ /bucket_big/ds=2008-04-09 [b]
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@bucket_big@ds=2008-04-08
+PREHOOK: Input: default@bucket_big@ds=2008-04-09
+PREHOOK: Input: default@bucket_small
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@bucket_big@ds=2008-04-08
+POSTHOOK: Input: default@bucket_big@ds=2008-04-09
+POSTHOOK: Input: default@bucket_small
+#### A masked pattern was here ####
+38
+PREHOOK: query: -- The hash table can only hold a very small amount (based on the parameter hive.mapjoin.localtask.max.memory.usage)
+-- so, the attempt to convert to map-join should fail.
+
+select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@bucket_big@ds=2008-04-08
+PREHOOK: Input: default@bucket_big@ds=2008-04-09
+PREHOOK: Input: default@bucket_small
+#### A masked pattern was here ####
+Execution failed with exit status: 3
+Obtaining error information
+
+Task failed!
+Task ID:
+ Stage-8
+
+Logs:
+
+#### A masked pattern was here ####
+FAILED: Execution Error, return code 3 from org.apache.hadoop.hive.ql.exec.MapredLocalTask
+ATTEMPT: Execute BackupTask: org.apache.hadoop.hive.ql.exec.MapRedTask
+POSTHOOK: query: -- The hash table can only hold a very small amount (based on the parameter hive.mapjoin.localtask.max.memory.usage)
+-- so, the attempt to convert to map-join should fail.
+
+select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@bucket_big@ds=2008-04-08
+POSTHOOK: Input: default@bucket_big@ds=2008-04-09
+POSTHOOK: Input: default@bucket_small
+#### A masked pattern was here ####
+38
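
The failed run above is the map-join safety net at work: the local hash-table task for Stage-8 exceeds its memory cap, exits with status 3, and the backup task (Stage-1, the sort-merge join) runs instead, still returning 38. A minimal sketch of a configuration that provokes this, using only property names that appear in the test; the tiny value is illustrative:

    import org.apache.hadoop.hive.conf.HiveConf;

    public class MapJoinFallbackSetup {
      public static HiveConf fallbackConf() {
        HiveConf conf = new HiveConf();
        conf.setBoolean("hive.auto.sort.merge.join", true);
        // Give the local task almost no memory so that building the hash
        // table aborts, exercising the backup-task path shown above.
        conf.set("hive.mapjoin.localtask.max.memory.usage", "0.0001");
        return conf;
      }
    }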
Index: ql/src/test/results/clientpositive/auto_sortmerge_join_1.q.out
===================================================================
--- ql/src/test/results/clientpositive/auto_sortmerge_join_1.q.out (revision 0)
+++ ql/src/test/results/clientpositive/auto_sortmerge_join_1.q.out (working copy)
@@ -0,0 +1,569 @@
+PREHOOK: query: -- small 1 part, 2 bucket & big 2 part, 4 bucket
+
+CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: -- small 1 part, 2 bucket & big 2 part, 4 bucket
+
+CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@bucket_small
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small
+POSTHOOK: Output: default@bucket_small@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small@ds=2008-04-08
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small@ds=2008-04-08
+PREHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@bucket_big
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big
+POSTHOOK: Output: default@bucket_big@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big@ds=2008-04-08
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big@ds=2008-04-08
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big@ds=2008-04-08
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big
+POSTHOOK: Output: default@bucket_big@ds=2008-04-09
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big@ds=2008-04-09
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big@ds=2008-04-09
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big@ds=2008-04-09
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big@ds=2008-04-09
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big@ds=2008-04-09
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big@ds=2008-04-09
+PREHOOK: query: -- Since size is being used to find the big table, the order of the tables in the join does not matter
+explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- Since size is being used to find the big table, the order of the tables in the join does not matter
+explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME bucket_small) a) (TOK_TABREF (TOK_TABNAME bucket_big) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count)))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ b
+ TableScan
+ alias: b
+ GatherStats: false
+ Sorted Merge Bucket Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-08
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5812
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 8
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 11624
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-09
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-09
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5812
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 8
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 11624
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+ Truncated Path -> Alias:
+ /bucket_big/ds=2008-04-08 [b]
+ /bucket_big/ds=2008-04-09 [b]
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: -mr-10002
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0
+ columns.types bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Truncated Path -> Alias:
+#### A masked pattern was here ####
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@bucket_big
+PREHOOK: Input: default@bucket_big@ds=2008-04-08
+PREHOOK: Input: default@bucket_big@ds=2008-04-09
+PREHOOK: Input: default@bucket_small
+PREHOOK: Input: default@bucket_small@ds=2008-04-08
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@bucket_big
+POSTHOOK: Input: default@bucket_big@ds=2008-04-08
+POSTHOOK: Input: default@bucket_big@ds=2008-04-09
+POSTHOOK: Input: default@bucket_small
+POSTHOOK: Input: default@bucket_small@ds=2008-04-08
+#### A masked pattern was here ####
+38
+PREHOOK: query: explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key
+PREHOOK: type: QUERY
+POSTHOOK: query: explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME bucket_big) a) (TOK_TABREF (TOK_TABNAME bucket_small) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count)))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a
+ TableScan
+ alias: a
+ GatherStats: false
+ Sorted Merge Bucket Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-08
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5812
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 8
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 11624
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-09
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-09
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5812
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 8
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 11624
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+ Truncated Path -> Alias:
+ /bucket_big/ds=2008-04-08 [a]
+ /bucket_big/ds=2008-04-09 [a]
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: -mr-10002
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0
+ columns.types bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Truncated Path -> Alias:
+#### A masked pattern was here ####
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@bucket_big
+PREHOOK: Input: default@bucket_big@ds=2008-04-08
+PREHOOK: Input: default@bucket_big@ds=2008-04-09
+PREHOOK: Input: default@bucket_small
+PREHOOK: Input: default@bucket_small@ds=2008-04-08
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@bucket_big
+POSTHOOK: Input: default@bucket_big@ds=2008-04-08
+POSTHOOK: Input: default@bucket_big@ds=2008-04-09
+POSTHOOK: Input: default@bucket_small
+POSTHOOK: Input: default@bucket_small@ds=2008-04-08
+#### A masked pattern was here ####
+38
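
Both plans above pick bucket_big as the big table (Position of Big Table: 1 when bucket_small is listed first, 0 when bucket_big is first), which is what a size-based policy implies. A schematic, non-Hive version of that choice; the sizes are stand-ins drawn from totalSize values reported in these outputs:

    public class SizeBasedBigTablePick {
      // Returns the join position of the table with the largest total size.
      static int pickBigTable(long[] totalSizes) {
        int big = 0;
        for (int i = 1; i < totalSizes.length; i++) {
          if (totalSizes[i] > totalSizes[big]) {
            big = i;
          }
        }
        return big;
      }

      public static void main(String[] args) {
        System.out.println(pickBigTable(new long[] {226, 11624})); // 1: bucket_big
        System.out.println(pickBigTable(new long[] {11624, 226})); // 0: still bucket_big
      }
    }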
Index: ql/src/test/results/clientpositive/auto_sortmerge_join_5.q.out
===================================================================
--- ql/src/test/results/clientpositive/auto_sortmerge_join_5.q.out (revision 0)
+++ ql/src/test/results/clientpositive/auto_sortmerge_join_5.q.out (working copy)
@@ -0,0 +1,418 @@
+PREHOOK: query: -- small no part, 4 bucket & big no part, 2 bucket
+CREATE TABLE bucket_small (key string, value string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: -- small no part, 4 bucket & big no part, 2 bucket
+CREATE TABLE bucket_small (key string, value string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@bucket_small
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small
+PREHOOK: query: CREATE TABLE bucket_big (key string, value string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE bucket_big (key string, value string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@bucket_big
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big
+PREHOOK: query: -- Since size is being used to find the big table, the order of the tables in the join does not matter
+explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- Since size is being used to find the big table, the order of the tables in the join does not matter
+explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME bucket_small) a) (TOK_TABREF (TOK_TABNAME bucket_big) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count)))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ b
+ TableScan
+ alias: b
+ GatherStats: false
+ Sorted Merge Bucket Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: bucket_big
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 2
+ numPartitions 0
+ numRows 0
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 2750
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 2
+ numPartitions 0
+ numRows 0
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 2750
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+ Truncated Path -> Alias:
+ /bucket_big [b]
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: -mr-10002
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0
+ columns.types bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Truncated Path -> Alias:
+#### A masked pattern was here ####
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@bucket_big
+PREHOOK: Input: default@bucket_small
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@bucket_big
+POSTHOOK: Input: default@bucket_small
+#### A masked pattern was here ####
+19
+PREHOOK: query: explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key
+PREHOOK: type: QUERY
+POSTHOOK: query: explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME bucket_big) a) (TOK_TABREF (TOK_TABNAME bucket_small) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count)))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a
+ TableScan
+ alias: a
+ GatherStats: false
+ Sorted Merge Bucket Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: bucket_big
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 2
+ numPartitions 0
+ numRows 0
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 2750
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 2
+ numPartitions 0
+ numRows 0
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 2750
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+ Truncated Path -> Alias:
+ /bucket_big [a]
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: -mr-10002
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0
+ columns.types bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Truncated Path -> Alias:
+#### A masked pattern was here ####
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@bucket_big
+PREHOOK: Input: default@bucket_small
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@bucket_big
+POSTHOOK: Input: default@bucket_small
+#### A masked pattern was here ####
+19
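The two extended plans above run the same join in both orders over the unpartitioned tables: with bucket_big aliased as b the operator reports "Position of Big Table: 1", and with it aliased as a the position flips to 0, while the result (19) is unchanged. A minimal session sketch for reproducing this by hand, assuming the stock sort-merge bucket map join settings of this Hive line (the auto-conversion flag introduced elsewhere in this patch is set the same way):

  set hive.optimize.bucketmapjoin = true;
  set hive.optimize.bucketmapjoin.sortedmerge = true;
  set hive.input.format = org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat;
  -- Both orderings should plan a Sorted Merge Bucket Map Join Operator,
  -- differing only in which position is marked as the big table:
  explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key;
  explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key;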
Index: ql/src/test/results/clientpositive/auto_sortmerge_join_4.q.out
===================================================================
--- ql/src/test/results/clientpositive/auto_sortmerge_join_4.q.out (revision 0)
+++ ql/src/test/results/clientpositive/auto_sortmerge_join_4.q.out (working copy)
@@ -0,0 +1,465 @@
+PREHOOK: query: -- small 2 part, 4 bucket & big 1 part, 2 bucket
+CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: -- small 2 part, 4 bucket & big 1 part, 2 bucket
+CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@bucket_small
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small
+POSTHOOK: Output: default@bucket_small@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small@ds=2008-04-08
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small@ds=2008-04-08
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small@ds=2008-04-08
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small
+POSTHOOK: Output: default@bucket_small@ds=2008-04-09
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small@ds=2008-04-09
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small@ds=2008-04-09
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small@ds=2008-04-09
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small@ds=2008-04-09
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small@ds=2008-04-09
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small@ds=2008-04-09
+PREHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@bucket_big
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big
+POSTHOOK: Output: default@bucket_big@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big@ds=2008-04-08
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big@ds=2008-04-08
+PREHOOK: query: -- Since size is being used to find the big table, the order of the tables in the join does not matter
+explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- Since size is being used to find the big table, the order of the tables in the join does not matter
+explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME bucket_small) a) (TOK_TABREF (TOK_TABNAME bucket_big) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count)))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ b
+ TableScan
+ alias: b
+ GatherStats: false
+ Sorted Merge Bucket Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-08
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 2
+ numPartitions 1
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 2750
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 2
+ numPartitions 1
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 2750
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+ Truncated Path -> Alias:
+ /bucket_big/ds=2008-04-08 [b]
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: -mr-10002
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0
+ columns.types bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Truncated Path -> Alias:
+#### A masked pattern was here ####
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@bucket_big
+PREHOOK: Input: default@bucket_big@ds=2008-04-08
+PREHOOK: Input: default@bucket_small
+PREHOOK: Input: default@bucket_small@ds=2008-04-08
+PREHOOK: Input: default@bucket_small@ds=2008-04-09
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@bucket_big
+POSTHOOK: Input: default@bucket_big@ds=2008-04-08
+POSTHOOK: Input: default@bucket_small
+POSTHOOK: Input: default@bucket_small@ds=2008-04-08
+POSTHOOK: Input: default@bucket_small@ds=2008-04-09
+#### A masked pattern was here ####
+38
+PREHOOK: query: explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key
+PREHOOK: type: QUERY
+POSTHOOK: query: explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME bucket_big) a) (TOK_TABREF (TOK_TABNAME bucket_small) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count)))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a
+ TableScan
+ alias: a
+ GatherStats: false
+ Sorted Merge Bucket Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-08
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 2
+ numPartitions 1
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 2750
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 2
+ numPartitions 1
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 2750
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+ Truncated Path -> Alias:
+ /bucket_big/ds=2008-04-08 [a]
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: -mr-10002
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0
+ columns.types bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Truncated Path -> Alias:
+#### A masked pattern was here ####
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@bucket_big
+PREHOOK: Input: default@bucket_big@ds=2008-04-08
+PREHOOK: Input: default@bucket_small
+PREHOOK: Input: default@bucket_small@ds=2008-04-08
+PREHOOK: Input: default@bucket_small@ds=2008-04-09
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@bucket_big
+POSTHOOK: Input: default@bucket_big@ds=2008-04-08
+POSTHOOK: Input: default@bucket_small
+POSTHOOK: Input: default@bucket_small@ds=2008-04-08
+POSTHOOK: Input: default@bucket_small@ds=2008-04-09
+#### A masked pattern was here ####
+38
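Relative to the unpartitioned case above, the count doubles from 19 to 38: the small table now carries two partitions loaded from identical files, so every qualifying big-table key matches once per small-side partition. A quick, hypothetical check (not part of the golden file) for which side was chosen is to read the extended plan:

  explain extended
  select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key;
  -- Look for "Sorted Merge Bucket Map Join Operator" and "Position of Big Table: 1":
  -- bucket_big is streamed even though it appears second in the FROM clause.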
Index: ql/src/test/results/clientpositive/auto_sortmerge_join_8.q.out
===================================================================
--- ql/src/test/results/clientpositive/auto_sortmerge_join_8.q.out (revision 0)
+++ ql/src/test/results/clientpositive/auto_sortmerge_join_8.q.out (working copy)
@@ -0,0 +1,584 @@
+PREHOOK: query: -- small 2 part, 2 bucket & big 2 part, 4 bucket
+CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: -- small 2 part, 2 bucket & big 2 part, 4 bucket
+CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@bucket_small
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small
+POSTHOOK: Output: default@bucket_small@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small@ds=2008-04-08
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small
+POSTHOOK: Output: default@bucket_small@ds=2008-04-09
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small@ds=2008-04-09
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small@ds=2008-04-09
+PREHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@bucket_big
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big
+POSTHOOK: Output: default@bucket_big@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big@ds=2008-04-08
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big@ds=2008-04-08
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big@ds=2008-04-08
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big
+POSTHOOK: Output: default@bucket_big@ds=2008-04-09
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big@ds=2008-04-09
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big@ds=2008-04-09
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big@ds=2008-04-09
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big@ds=2008-04-09
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big@ds=2008-04-09
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big@ds=2008-04-09
+PREHOOK: query: -- Since size is being used to find the big table, the order of the tables in the join does not matter
+explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- Since size is being used to find the big table, the order of the tables in the join does not matter
+explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME bucket_small) a) (TOK_TABREF (TOK_TABNAME bucket_big) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count)))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ b
+ TableScan
+ alias: b
+ GatherStats: false
+ Sorted Merge Bucket Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-08
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5812
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 8
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 11624
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-09
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-09
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5812
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 8
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 11624
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+ Truncated Path -> Alias:
+ /bucket_big/ds=2008-04-08 [b]
+ /bucket_big/ds=2008-04-09 [b]
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: -mr-10002
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0
+ columns.types bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Truncated Path -> Alias:
+#### A masked pattern was here ####
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@bucket_big
+PREHOOK: Input: default@bucket_big@ds=2008-04-08
+PREHOOK: Input: default@bucket_big@ds=2008-04-09
+PREHOOK: Input: default@bucket_small
+PREHOOK: Input: default@bucket_small@ds=2008-04-08
+PREHOOK: Input: default@bucket_small@ds=2008-04-09
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@bucket_big
+POSTHOOK: Input: default@bucket_big@ds=2008-04-08
+POSTHOOK: Input: default@bucket_big@ds=2008-04-09
+POSTHOOK: Input: default@bucket_small
+POSTHOOK: Input: default@bucket_small@ds=2008-04-08
+POSTHOOK: Input: default@bucket_small@ds=2008-04-09
+#### A masked pattern was here ####
+76
+PREHOOK: query: explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key
+PREHOOK: type: QUERY
+POSTHOOK: query: explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME bucket_big) a) (TOK_TABREF (TOK_TABNAME bucket_small) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count)))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a
+ TableScan
+ alias: a
+ GatherStats: false
+ Sorted Merge Bucket Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-08
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5812
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 8
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 11624
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-09
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-09
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5812
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 8
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 11624
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+ Truncated Path -> Alias:
+ /bucket_big/ds=2008-04-08 [a]
+ /bucket_big/ds=2008-04-09 [a]
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: -mr-10002
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0
+ columns.types bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Truncated Path -> Alias:
+#### A masked pattern was here ####
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@bucket_big
+PREHOOK: Input: default@bucket_big@ds=2008-04-08
+PREHOOK: Input: default@bucket_big@ds=2008-04-09
+PREHOOK: Input: default@bucket_small
+PREHOOK: Input: default@bucket_small@ds=2008-04-08
+PREHOOK: Input: default@bucket_small@ds=2008-04-09
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@bucket_big
+POSTHOOK: Input: default@bucket_big@ds=2008-04-08
+POSTHOOK: Input: default@bucket_big@ds=2008-04-09
+POSTHOOK: Input: default@bucket_small
+POSTHOOK: Input: default@bucket_small@ds=2008-04-08
+POSTHOOK: Input: default@bucket_small@ds=2008-04-09
+#### A masked pattern was here ####
+76
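Note how this variant streams both big-table partitions in a single map stage (Truncated Path -> Alias lists ds=2008-04-08 and ds=2008-04-09), and the count rises to 76 accordingly. The DDL keeps the bucket counts in an even ratio (2 vs. 4), which the sort-merge conversion is assumed to require so that each big-table bucket pairs with a fixed set of small-table buckets; the pattern used by the test:

  CREATE TABLE bucket_small (key string, value string) PARTITIONED BY (ds string)
    CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE;
  CREATE TABLE bucket_big (key string, value string) PARTITIONED BY (ds string)
    CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE;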
Index: ql/src/test/results/clientpositive/auto_sortmerge_join_3.q.out
===================================================================
--- ql/src/test/results/clientpositive/auto_sortmerge_join_3.q.out (revision 0)
+++ ql/src/test/results/clientpositive/auto_sortmerge_join_3.q.out (working copy)
@@ -0,0 +1,453 @@
+PREHOOK: query: -- small 2 part, 2 bucket & big 1 part, 4 bucket
+CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: -- small 2 part, 2 bucket & big 1 part, 4 bucket
+CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@bucket_small
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small
+POSTHOOK: Output: default@bucket_small@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small@ds=2008-04-08
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small
+POSTHOOK: Output: default@bucket_small@ds=2008-04-09
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small@ds=2008-04-09
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small@ds=2008-04-09
+PREHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@bucket_big
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big
+POSTHOOK: Output: default@bucket_big@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big@ds=2008-04-08
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big@ds=2008-04-08
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big@ds=2008-04-08
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big@ds=2008-04-08
+PREHOOK: query: -- Since size is being used to find the big table, the order of the tables in the join does not matter
+explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- Since size is being used to find the big table, the order of the tables in the join does not matter
+explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME bucket_small) a) (TOK_TABREF (TOK_TABNAME bucket_big) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count)))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ b
+ TableScan
+ alias: b
+ GatherStats: false
+ Sorted Merge Bucket Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-08
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numPartitions 1
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5812
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numPartitions 1
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5812
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+ Truncated Path -> Alias:
+ /bucket_big/ds=2008-04-08 [b]
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: -mr-10002
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0
+ columns.types bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Truncated Path -> Alias:
+#### A masked pattern was here ####
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@bucket_big
+PREHOOK: Input: default@bucket_big@ds=2008-04-08
+PREHOOK: Input: default@bucket_small
+PREHOOK: Input: default@bucket_small@ds=2008-04-08
+PREHOOK: Input: default@bucket_small@ds=2008-04-09
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@bucket_big
+POSTHOOK: Input: default@bucket_big@ds=2008-04-08
+POSTHOOK: Input: default@bucket_small
+POSTHOOK: Input: default@bucket_small@ds=2008-04-08
+POSTHOOK: Input: default@bucket_small@ds=2008-04-09
+#### A masked pattern was here ####
+38
+PREHOOK: query: explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key
+PREHOOK: type: QUERY
+POSTHOOK: query: explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME bucket_big) a) (TOK_TABREF (TOK_TABNAME bucket_small) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count)))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a
+ TableScan
+ alias: a
+ GatherStats: false
+ Sorted Merge Bucket Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-08
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numPartitions 1
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5812
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numPartitions 1
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5812
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+ Truncated Path -> Alias:
+ /bucket_big/ds=2008-04-08 [a]
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: -mr-10002
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0
+ columns.types bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Truncated Path -> Alias:
+#### A masked pattern was here ####
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@bucket_big
+PREHOOK: Input: default@bucket_big@ds=2008-04-08
+PREHOOK: Input: default@bucket_small
+PREHOOK: Input: default@bucket_small@ds=2008-04-08
+PREHOOK: Input: default@bucket_small@ds=2008-04-09
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@bucket_big
+POSTHOOK: Input: default@bucket_big@ds=2008-04-08
+POSTHOOK: Input: default@bucket_small
+POSTHOOK: Input: default@bucket_small@ds=2008-04-08
+POSTHOOK: Input: default@bucket_small@ds=2008-04-09
+#### A masked pattern was here ####
+38
Index: ql/src/test/results/clientpositive/auto_sortmerge_join_7.q.out
===================================================================
--- ql/src/test/results/clientpositive/auto_sortmerge_join_7.q.out (revision 0)
+++ ql/src/test/results/clientpositive/auto_sortmerge_join_7.q.out (working copy)
@@ -0,0 +1,584 @@
+PREHOOK: query: -- small 2 part, 4 bucket & big 2 part, 2 bucket
+CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: -- small 2 part, 4 bucket & big 2 part, 2 bucket
+CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@bucket_small
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small
+POSTHOOK: Output: default@bucket_small@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small@ds=2008-04-08
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small@ds=2008-04-08
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small@ds=2008-04-08
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small
+POSTHOOK: Output: default@bucket_small@ds=2008-04-09
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small@ds=2008-04-09
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small@ds=2008-04-09
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small@ds=2008-04-09
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small@ds=2008-04-09
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small@ds=2008-04-09
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small@ds=2008-04-09
+PREHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@bucket_big
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big
+POSTHOOK: Output: default@bucket_big@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big@ds=2008-04-08
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big
+POSTHOOK: Output: default@bucket_big@ds=2008-04-09
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big@ds=2008-04-09
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big@ds=2008-04-09
+PREHOOK: query: -- Since size is being used to find the big table, the order of the tables in the join does not matter
+explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- Since size is being used to find the big table, the order of the tables in the join does not matter
+explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME bucket_small) a) (TOK_TABREF (TOK_TABNAME bucket_big) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count)))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ b
+ TableScan
+ alias: b
+ GatherStats: false
+ Sorted Merge Bucket Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-08
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 2
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 2750
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5500
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-09
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-09
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 2
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 2750
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5500
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+ Truncated Path -> Alias:
+ /bucket_big/ds=2008-04-08 [b]
+ /bucket_big/ds=2008-04-09 [b]
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: -mr-10002
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0
+ columns.types bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Truncated Path -> Alias:
+#### A masked pattern was here ####
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@bucket_big
+PREHOOK: Input: default@bucket_big@ds=2008-04-08
+PREHOOK: Input: default@bucket_big@ds=2008-04-09
+PREHOOK: Input: default@bucket_small
+PREHOOK: Input: default@bucket_small@ds=2008-04-08
+PREHOOK: Input: default@bucket_small@ds=2008-04-09
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@bucket_big
+POSTHOOK: Input: default@bucket_big@ds=2008-04-08
+POSTHOOK: Input: default@bucket_big@ds=2008-04-09
+POSTHOOK: Input: default@bucket_small
+POSTHOOK: Input: default@bucket_small@ds=2008-04-08
+POSTHOOK: Input: default@bucket_small@ds=2008-04-09
+#### A masked pattern was here ####
+76
+PREHOOK: query: explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key
+PREHOOK: type: QUERY
+POSTHOOK: query: explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME bucket_big) a) (TOK_TABREF (TOK_TABNAME bucket_small) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count)))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a
+ TableScan
+ alias: a
+ GatherStats: false
+ Sorted Merge Bucket Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-08
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 2
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 2750
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5500
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-09
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-09
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 2
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 2750
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5500
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+ Truncated Path -> Alias:
+ /bucket_big/ds=2008-04-08 [a]
+ /bucket_big/ds=2008-04-09 [a]
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: -mr-10002
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0
+ columns.types bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Truncated Path -> Alias:
+#### A masked pattern was here ####
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@bucket_big
+PREHOOK: Input: default@bucket_big@ds=2008-04-08
+PREHOOK: Input: default@bucket_big@ds=2008-04-09
+PREHOOK: Input: default@bucket_small
+PREHOOK: Input: default@bucket_small@ds=2008-04-08
+PREHOOK: Input: default@bucket_small@ds=2008-04-09
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@bucket_big
+POSTHOOK: Input: default@bucket_big@ds=2008-04-08
+POSTHOOK: Input: default@bucket_big@ds=2008-04-09
+POSTHOOK: Input: default@bucket_small
+POSTHOOK: Input: default@bucket_small@ds=2008-04-08
+POSTHOOK: Input: default@bucket_small@ds=2008-04-09
+#### A masked pattern was here ####
+76
Index: ql/src/test/results/clientpositive/auto_sortmerge_join_2.q.out
===================================================================
--- ql/src/test/results/clientpositive/auto_sortmerge_join_2.q.out (revision 0)
+++ ql/src/test/results/clientpositive/auto_sortmerge_join_2.q.out (working copy)
@@ -0,0 +1,310 @@
+PREHOOK: query: -- small 1 part, 4 bucket & big 2 part, 2 bucket
+CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: -- small 1 part, 4 bucket & big 2 part, 2 bucket
+CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@bucket_small
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small
+POSTHOOK: Output: default@bucket_small@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small@ds=2008-04-08
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small@ds=2008-04-08
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_small@ds=2008-04-08
+POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_small@ds=2008-04-08
+PREHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@bucket_big
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big
+POSTHOOK: Output: default@bucket_big@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big@ds=2008-04-08
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big@ds=2008-04-08
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big
+POSTHOOK: Output: default@bucket_big@ds=2008-04-09
+PREHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@bucket_big@ds=2008-04-09
+POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@bucket_big@ds=2008-04-09
+PREHOOK: query: -- Since the leftmost table is assumed to be the big table, arrange the tables in the join accordingly
+explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- Since the leftmost table is assumed to be the big table, arrange the tables in the join accordingly
+explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME bucket_big) a) (TOK_TABREF (TOK_TABNAME bucket_small) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count)))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a
+ TableScan
+ alias: a
+ GatherStats: false
+ Sorted Merge Bucket Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ keys:
+ 0 [Column[key]]
+ 1 [Column[key]]
+ Position of Big Table: 0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-08
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 2
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 2750
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5500
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+#### A masked pattern was here ####
+ Partition
+ base file name: ds=2008-04-09
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-09
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 2
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 2750
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ SORTBUCKETCOLSPREFIX TRUE
+ bucket_count 2
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.bucket_big
+ numFiles 4
+ numPartitions 2
+ numRows 0
+ partition_columns ds
+ rawDataSize 0
+ serialization.ddl struct bucket_big { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5500
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucket_big
+ name: default.bucket_big
+ Truncated Path -> Alias:
+ /bucket_big/ds=2008-04-08 [a]
+ /bucket_big/ds=2008-04-09 [a]
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count()
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: -mr-10002
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns
+ columns.types
+ escape.delim \
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0
+ columns.types bigint
+ escape.delim \
+ serialization.format 1
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Truncated Path -> Alias:
+#### A masked pattern was here ####
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@bucket_big
+PREHOOK: Input: default@bucket_big@ds=2008-04-08
+PREHOOK: Input: default@bucket_big@ds=2008-04-09
+PREHOOK: Input: default@bucket_small
+PREHOOK: Input: default@bucket_small@ds=2008-04-08
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@bucket_big
+POSTHOOK: Input: default@bucket_big@ds=2008-04-08
+POSTHOOK: Input: default@bucket_big@ds=2008-04-09
+POSTHOOK: Input: default@bucket_small
+POSTHOOK: Input: default@bucket_small@ds=2008-04-08
+#### A masked pattern was here ####
+38
Index: ql/src/test/queries/clientpositive/auto_sortmerge_join_3.q
===================================================================
--- ql/src/test/queries/clientpositive/auto_sortmerge_join_3.q (revision 0)
+++ ql/src/test/queries/clientpositive/auto_sortmerge_join_3.q (working copy)
@@ -0,0 +1,26 @@
+-- small 2 part, 2 bucket & big 1 part, 4 bucket
+CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE;
+load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08');
+load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08');
+
+load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09');
+load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09');
+
+CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE;
+load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08');
+load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08');
+load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08');
+load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08');
+
+set hive.auto.sort.merge.join=true;
+set hive.optimize.bucketmapjoin = true;
+set hive.optimize.bucketmapjoin.sortedmerge = true;
+
+set hive.auto.sort.merge.join.bigtbl.matcher = org.apache.hadoop.hive.ql.optimizer.SizeSortMergeJoinBigTableMatcher;
+
+-- Since size is being used to find the big table, the order of the tables in the join does not matter
+explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key;
+select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key;
+
+explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key;
+select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key;
Index: ql/src/test/queries/clientpositive/auto_sortmerge_join_7.q
===================================================================
--- ql/src/test/queries/clientpositive/auto_sortmerge_join_7.q (revision 0)
+++ ql/src/test/queries/clientpositive/auto_sortmerge_join_7.q (working copy)
@@ -0,0 +1,31 @@
+-- small 2 part, 4 bucket & big 2 part, 2 bucket
+CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE;
+load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08');
+load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08');
+load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08');
+load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08');
+
+load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09');
+load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09');
+load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09');
+load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09');
+
+CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE;
+load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08');
+load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08');
+
+load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09');
+load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09');
+
+set hive.auto.sort.merge.join=true;
+set hive.optimize.bucketmapjoin = true;
+set hive.optimize.bucketmapjoin.sortedmerge = true;
+
+set hive.auto.sort.merge.join.bigtbl.matcher = org.apache.hadoop.hive.ql.optimizer.AvgPartitionSizeSortMergeJoinBigTableMatcher;
+
+-- Since size is being used to find the big table, the order of the tables in the join does not matter
+explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key;
+select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key;
+
+explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key;
+select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key;
Index: ql/src/test/queries/clientpositive/auto_sortmerge_join_4.q
===================================================================
--- ql/src/test/queries/clientpositive/auto_sortmerge_join_4.q (revision 0)
+++ ql/src/test/queries/clientpositive/auto_sortmerge_join_4.q (working copy)
@@ -0,0 +1,28 @@
+-- small 2 part, 4 bucket & big 1 part, 2 bucket
+CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE;
+load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08');
+load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08');
+load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08');
+load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08');
+
+load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09');
+load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09');
+load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09');
+load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09');
+
+CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE;
+load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08');
+load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08');
+
+set hive.auto.sort.merge.join=true;
+set hive.optimize.bucketmapjoin = true;
+set hive.optimize.bucketmapjoin.sortedmerge = true;
+
+set hive.auto.sort.merge.join.bigtbl.matcher = org.apache.hadoop.hive.ql.optimizer.AvgPartitionSizeSortMergeJoinBigTableMatcher;
+
+-- Since size is being used to find the big table, the order of the tables in the join does not matter
+explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key;
+select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key;
+
+explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key;
+select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key;
Index: ql/src/test/queries/clientpositive/auto_sortmerge_join_8.q
===================================================================
--- ql/src/test/queries/clientpositive/auto_sortmerge_join_8.q (revision 0)
+++ ql/src/test/queries/clientpositive/auto_sortmerge_join_8.q (working copy)
@@ -0,0 +1,31 @@
+-- small 2 part, 2 bucket & big 2 part, 4 bucket
+CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE;
+load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08');
+load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08');
+
+load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09');
+load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09');
+
+CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE;
+load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08');
+load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08');
+load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08');
+load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08');
+
+load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09');
+load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09');
+load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09');
+load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09');
+
+set hive.auto.sort.merge.join=true;
+set hive.optimize.bucketmapjoin = true;
+set hive.optimize.bucketmapjoin.sortedmerge = true;
+
+set hive.auto.sort.merge.join.bigtbl.matcher = org.apache.hadoop.hive.ql.optimizer.SizeSortMergeJoinBigTableMatcher;
+
+-- Since size is being used to find the big table, the order of the tables in the join does not matter
+explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key;
+select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key;
+
+explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key;
+select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key;
Index: ql/src/test/queries/clientpositive/auto_sortmerge_join_1.q
===================================================================
--- ql/src/test/queries/clientpositive/auto_sortmerge_join_1.q (revision 0)
+++ ql/src/test/queries/clientpositive/auto_sortmerge_join_1.q (working copy)
@@ -0,0 +1,29 @@
+-- small 1 part, 2 bucket & big 2 part, 4 bucket
+
+CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE;
+load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08');
+load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08');
+
+CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE;
+load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08');
+load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08');
+load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08');
+load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08');
+
+load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09');
+load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09');
+load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09');
+load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09');
+
+set hive.auto.sort.merge.join=true;
+set hive.optimize.bucketmapjoin = true;
+set hive.optimize.bucketmapjoin.sortedmerge = true;
+
+set hive.auto.sort.merge.join.bigtbl.matcher = org.apache.hadoop.hive.ql.optimizer.SizeSortMergeJoinBigTableMatcher;
+
+-- Since size is being used to find the big table, the order of the tables in the join does not matter
+explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key;
+select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key;
+
+explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key;
+select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key;
Index: ql/src/test/queries/clientpositive/auto_sortmerge_join_5.q
===================================================================
--- ql/src/test/queries/clientpositive/auto_sortmerge_join_5.q (revision 0)
+++ ql/src/test/queries/clientpositive/auto_sortmerge_join_5.q (working copy)
@@ -0,0 +1,23 @@
+-- small no part, 4 bucket & big no part, 2 bucket
+CREATE TABLE bucket_small (key string, value string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE;
+load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small;
+load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small;
+load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small;
+load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small;
+
+CREATE TABLE bucket_big (key string, value string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE;
+load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big;
+load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big;
+
+set hive.auto.sort.merge.join=true;
+set hive.optimize.bucketmapjoin = true;
+set hive.optimize.bucketmapjoin.sortedmerge = true;
+
+set hive.auto.sort.merge.join.bigtbl.matcher = org.apache.hadoop.hive.ql.optimizer.AvgPartitionSizeSortMergeJoinBigTableMatcher;
+
+-- Since size is being used to find the big table, the order of the tables in the join does not matter
+explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key;
+select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key;
+
+explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key;
+select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key;
Index: ql/src/test/queries/clientpositive/auto_sortmerge_join_2.q
===================================================================
--- ql/src/test/queries/clientpositive/auto_sortmerge_join_2.q (revision 0)
+++ ql/src/test/queries/clientpositive/auto_sortmerge_join_2.q (working copy)
@@ -0,0 +1,23 @@
+-- small 1 part, 4 bucket & big 2 part, 2 bucket
+CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE;
+load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08');
+load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08');
+load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08');
+load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08');
+
+CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE;
+load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08');
+load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08');
+
+load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09');
+load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09');
+
+set hive.auto.sort.merge.join=true;
+set hive.optimize.bucketmapjoin = true;
+set hive.optimize.bucketmapjoin.sortedmerge = true;
+
+set hive.auto.sort.merge.join.bigtbl.matcher = org.apache.hadoop.hive.ql.optimizer.LeftSortMergeJoinBigTableMatcher;
+
+-- Since the leftmost table is assumed to be the big table, arrange the tables in the join accordingly
+explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key;
+select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key;
Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/AvgPartitionSizeSortMergeJoinBigTableMatcher.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/AvgPartitionSizeSortMergeJoinBigTableMatcher.java (revision 0)
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/AvgPartitionSizeSortMergeJoinBigTableMatcher.java (working copy)
@@ -0,0 +1,103 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.optimizer;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.ql.exec.JoinOperator;
+import org.apache.hadoop.hive.ql.exec.TableScanOperator;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.metadata.Partition;
+import org.apache.hadoop.hive.ql.metadata.Table;
+import org.apache.hadoop.hive.ql.optimizer.ppr.PartitionPruner;
+import org.apache.hadoop.hive.ql.parse.ParseContext;
+import org.apache.hadoop.hive.ql.parse.PrunedPartitionList;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+
+/*
+ * A pluggable policy for choosing the big table when converting a join to a
+ * sort-merge join. This policy picks the table with the largest average
+ * partition size.
+ */
+public class AvgPartitionSizeSortMergeJoinBigTableMatcher
+ extends CommonSizeSortMergeJoinBigTableMatcher
+ implements SortMergeJoinBigTableMatcher {
+
+ private static final Log LOG = LogFactory
+ .getLog(AvgPartitionSizeSortMergeJoinBigTableMatcher.class.getName());
+
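+  /**
+   * Picks the big table among the join inputs: for each table feeding the join,
+   * compute the average size per (non-pruned) partition and return the position
+   * of the table with the largest average; ties go to the table with fewer
+   * partitions.
+   */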
+ public int getBigTablePosition(ParseContext parseCtx, JoinOperator joinOp)
+ throws SemanticException {
+ int bigTablePos = 0;
+ long maxSize = 0;
+ int numPartitionsCurrentBigTable = 0; // number of partitions for the chosen big table
+ HiveConf conf = parseCtx.getConf();
+
+ try {
+      List<TableScanOperator> topOps = new ArrayList<TableScanOperator>();
+ getListTopOps(joinOp, topOps);
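+      // topOps holds the TableScanOperators feeding this join, in the same
+      // order as the join inputs, so currentPos below is the join position.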
+ int currentPos = 0;
+ for (TableScanOperator topOp : topOps) {
+        // In case the sizes match, preference is given to the table with
+        // fewer partitions
+        int numPartitions = 1;
+ Table table = parseCtx.getTopToTable().get(topOp);
+ long averageSize = 0;
+
+ if (!table.isPartitioned()) {
+ averageSize = getSize(conf, table);
+        } else {
+ // For partitioned tables, get the size of all the partitions
+ PrunedPartitionList partsList =
+ PartitionPruner.prune(parseCtx.getTopToTable().get(topOp),
+ parseCtx.getOpToPartPruner().get(topOp), parseCtx.getConf(),
+ null, parseCtx.getPrunedPartitions());
+ numPartitions = partsList.getNotDeniedPartns().size();
+          long totalSize = 0;
+          for (Partition part : partsList.getNotDeniedPartns()) {
+            totalSize += getSize(conf, part);
+          }
+          // Guard against dividing by zero when every partition has been pruned
+          averageSize = (numPartitions == 0) ? 0 : totalSize / numPartitions;
+ }
+
+ if (averageSize > maxSize) {
+ maxSize = averageSize;
+ bigTablePos = currentPos;
+ numPartitionsCurrentBigTable = numPartitions;
+        } else if (averageSize == maxSize) {
+          // If the sizes match, prefer the table with fewer partitions
+          if (numPartitions < numPartitionsCurrentBigTable) {
+ bigTablePos = currentPos;
+ numPartitionsCurrentBigTable = numPartitions;
+ }
+ }
+
+ currentPos++;
+ }
+ } catch (HiveException e) {
+ throw new SemanticException(e.getMessage());
+ }
+
+ return bigTablePos;
+ }
+}
Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/AbstractSMBJoinProc.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/AbstractSMBJoinProc.java (revision 0)
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/AbstractSMBJoinProc.java (working copy)
@@ -0,0 +1,542 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.optimizer;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Stack;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.metastore.api.Order;
+import org.apache.hadoop.hive.ql.exec.DummyStoreOperator;
+import org.apache.hadoop.hive.ql.exec.FunctionRegistry;
+import org.apache.hadoop.hive.ql.exec.JoinOperator;
+import org.apache.hadoop.hive.ql.exec.MapJoinOperator;
+import org.apache.hadoop.hive.ql.exec.Operator;
+import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
+import org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator;
+import org.apache.hadoop.hive.ql.exec.TableScanOperator;
+import org.apache.hadoop.hive.ql.lib.Node;
+import org.apache.hadoop.hive.ql.lib.NodeProcessor;
+import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.metadata.Partition;
+import org.apache.hadoop.hive.ql.metadata.Table;
+import org.apache.hadoop.hive.ql.optimizer.ppr.PartitionPruner;
+import org.apache.hadoop.hive.ql.parse.ParseContext;
+import org.apache.hadoop.hive.ql.parse.PrunedPartitionList;
+import org.apache.hadoop.hive.ql.parse.QB;
+import org.apache.hadoop.hive.ql.parse.QBJoinTree;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.parse.TableAccessAnalyzer;
+import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
+import org.apache.hadoop.hive.ql.plan.OperatorDesc;
+import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc;
+import org.apache.hadoop.hive.ql.plan.SMBJoinDesc;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
+import org.apache.hadoop.util.ReflectionUtils;
+
+// Common logic for converting a join or bucket map join to a sort-merge bucket map join
+abstract public class AbstractSMBJoinProc extends AbstractBucketJoinProc implements NodeProcessor {
+
+ private static final Log LOG = LogFactory
+ .getLog(SortedMergeBucketMapJoinOptimizer.class.getName());
+
+ public AbstractSMBJoinProc(ParseContext pctx) {
+ super(pctx);
+ }
+
+ public AbstractSMBJoinProc() {
+ super();
+ }
+
+ @Override
+ abstract public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
+ Object... nodeOutputs) throws SemanticException;
+
+ // Return true or false based on whether the mapjoin was converted successfully to
+ // a sort-merge map join operator.
+ protected boolean canConvertBucketMapJoinToSMBJoin(MapJoinOperator mapJoinOp,
+ Stack<Node> stack,
+ SortBucketJoinOptProcCtx smbJoinContext,
+ Object... nodeOutputs) throws SemanticException {
+
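+ // The alias to bucket file name mapping is only populated once the mapjoin has been
+ // validated as a bucket map join; without it, there is nothing to convert here.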
+ if (mapJoinOp.getConf().getAliasBucketFileNameMapping() == null
+ || mapJoinOp.getConf().getAliasBucketFileNameMapping().size() == 0) {
+ return false;
+ }
+
+ boolean tableSorted = true;
+ QBJoinTree joinCxt = this.pGraphContext.getMapJoinContext()
+ .get(mapJoinOp);
+ if (joinCxt == null) {
+ return false;
+ }
+ String[] srcs = joinCxt.getBaseSrc();
+ for (int srcPos = 0; srcPos < srcs.length; srcPos++) {
+ srcs[srcPos] = QB.getAppendedAliasFromId(joinCxt.getId(), srcs[srcPos]);
+ }
+
+ // The sort columns of all the tables/partitions should be in the same order.
+ // For example, if tables A and B are being joined on columns c1, c2 and c3,
+ // which are the sorted and bucketed columns, the join works as long as
+ // c1, c2 and c3 are sorted in the same order.
+ List<Order> sortColumnsFirstTable = new ArrayList<Order>();
+
+ for (int pos = 0; pos < srcs.length; pos++) {
+ tableSorted = tableSorted
+ && isTableSorted(smbJoinContext,
+ pGraphContext,
+ mapJoinOp.getConf().getKeys().get((byte) pos),
+ joinCxt,
+ srcs,
+ pos,
+ sortColumnsFirstTable);
+ }
+ if (!tableSorted) {
+ // this is a mapjoin but not suited for a sort merge bucket map join. check outer joins
+ MapJoinProcessor.checkMapJoin(mapJoinOp.getConf().getPosBigTable(),
+ mapJoinOp.getConf().getConds());
+ return false;
+ }
+
+ smbJoinContext.setSrcs(srcs);
+ return true;
+ }
+
+
+ // Convert the bucket map-join operator to a sort-merge map join operator
+ protected SMBMapJoinOperator convertBucketMapJoinToSMBJoin(MapJoinOperator mapJoinOp,
+ SortBucketJoinOptProcCtx smbJoinContext,
+ ParseContext parseContext) {
+
+ String[] srcs = smbJoinContext.getSrcs();
+ SMBMapJoinOperator smbJop = new SMBMapJoinOperator(mapJoinOp);
+ SMBJoinDesc smbJoinDesc = new SMBJoinDesc(mapJoinOp.getConf());
+ smbJop.setConf(smbJoinDesc);
+ HashMap<Byte, String> tagToAlias = new HashMap<Byte, String>();
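+ // Join sources are tagged in position order, so tag i maps to the i-th alias.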
+ for (int i = 0; i < srcs.length; i++) {
+ tagToAlias.put((byte) i, srcs[i]);
+ }
+ smbJoinDesc.setTagToAlias(tagToAlias);
+
+ int indexInListMapJoinNoReducer =
+ this.pGraphContext.getListMapJoinOpsNoReducer().indexOf(mapJoinOp);
+ if (indexInListMapJoinNoReducer >= 0 ) {
+ this.pGraphContext.getListMapJoinOpsNoReducer().remove(indexInListMapJoinNoReducer);
+ this.pGraphContext.getListMapJoinOpsNoReducer().add(indexInListMapJoinNoReducer, smbJop);
+ }
+
+ Map<String, DummyStoreOperator> aliasToSink =
+ new HashMap<String, DummyStoreOperator>();
+ // For all parents (other than the big table), insert a dummy store operator
+ /* Consider a query like:
+ *
+ * select * from
+ * (subq1 --> has a filter)
+ * join
+ * (subq2 --> has a filter)
+ * on some key
+ *
+ * Let us assume that subq1 is the small table (either specified by the user or inferred
+ * automatically). The following operator tree will be created:
+ *
+ * TableScan (subq1) --> Select --> Filter --> DummyStore
+ * \
+ * \ SMBJoin
+ * /
+ * /
+ * TableScan (subq2) --> Select --> Filter
+ */
+
+ List<Operator<? extends OperatorDesc>> parentOperators = mapJoinOp.getParentOperators();
+ for (int i = 0; i < parentOperators.size(); i++) {
+ Operator<? extends OperatorDesc> par = parentOperators.get(i);
+ int index = par.getChildOperators().indexOf(mapJoinOp);
+ par.getChildOperators().remove(index);
+ if (i == smbJoinDesc.getPosBigTable()) {
+ par.getChildOperators().add(index, smbJop);
+ }
+ else {
+ DummyStoreOperator dummyStoreOp = new DummyStoreOperator();
+ par.getChildOperators().add(index, dummyStoreOp);
+
+ List<Operator<? extends OperatorDesc>> childrenOps =
+ new ArrayList<Operator<? extends OperatorDesc>>();
+ childrenOps.add(smbJop);
+ dummyStoreOp.setChildOperators(childrenOps);
+
+ List<Operator<? extends OperatorDesc>> parentOps =
+ new ArrayList<Operator<? extends OperatorDesc>>();
+ parentOps.add(par);
+ dummyStoreOp.setParentOperators(parentOps);
+
+ aliasToSink.put(srcs[i], dummyStoreOp);
+ smbJop.getParentOperators().remove(i);
+ smbJop.getParentOperators().add(i, dummyStoreOp);
+ }
+ }
+ smbJoinDesc.setAliasToSink(aliasToSink);
+
+ List<Operator<? extends OperatorDesc>> childOps = mapJoinOp.getChildOperators();
+ for (int i = 0; i < childOps.size(); i++) {
+ Operator<? extends OperatorDesc> child = childOps.get(i);
+ int index = child.getParentOperators().indexOf(mapJoinOp);
+ child.getParentOperators().remove(index);
+ child.getParentOperators().add(index, smbJop);
+ }
+ parseContext.getSmbMapJoinContext().put(smbJop,
+ parseContext.getMapJoinContext().get(mapJoinOp));
+ parseContext.getMapJoinContext().remove(mapJoinOp);
+ parseContext.getOpParseCtx().put(smbJop, parseContext.getOpParseCtx().get(mapJoinOp));
+
+ return smbJop;
+ }
+
+ /**
+ * Whether the table at the given position is eligible for a sort-merge join.
+ *
+ * @param smbJoinContext context for the sort-merge join conversion
+ * @param pctx parse context
+ * @param keys the join key expressions for this table
+ * @param joinTree join tree being considered
+ * @param aliases the table aliases in the join tree
+ * @param pos position of the table being checked
+ * @param sortColumnsFirstTable The names and order of the sorted columns for the first table.
+ * It is populated when pos = 0 and compared against for the other tables.
+ * @return whether the table is sorted appropriately for a sort-merge join
+ * @throws SemanticException
+ */
+ private boolean isTableSorted(
+ SortBucketJoinOptProcCtx smbJoinContext,
+ ParseContext pctx,
+ List<ExprNodeDesc> keys,
+ QBJoinTree joinTree,
+ String[] aliases,
+ int pos,
+ List sortColumnsFirstTable) throws SemanticException {
+ String alias = aliases[pos];
+ Map<TableScanOperator, Table> topToTable = this.pGraphContext
+ .getTopToTable();
+
+
+ /*
+ * Consider a query like:
+ *
+ * select -- mapjoin(subq1) -- * from
+ * (select a.key, a.value from tbl1 a) subq1
+ * join
+ * (select a.key, a.value from tbl2 a) subq2
+ * on subq1.key = subq2.key;
+ *
+ * aliasToOpInfo contains the SelectOperator for subq1 and subq2.
+ * We need to traverse the tree (using TableAccessAnalyzer) to get to the base
+ * table. If the object being map-joined is a base table, then aliasToOpInfo
+ * contains the TableScanOperator, and TableAccessAnalyzer is a no-op.
+ */
+ Operator<? extends OperatorDesc> topOp = joinTree.getAliasToOpInfo().get(alias);
+ if (topOp == null) {
+ return false;
+ }
+
+ // get all join columns from join keys
+ List<String> joinCols = new ArrayList<String>();
+ List<ExprNodeDesc> joinKeys = new ArrayList<ExprNodeDesc>();
+ joinKeys.addAll(keys);
+ while (joinKeys.size() > 0) {
+ ExprNodeDesc node = joinKeys.remove(0);
+ if (node instanceof ExprNodeColumnDesc) {
+ joinCols.addAll(node.getCols());
+ } else if (node instanceof ExprNodeGenericFuncDesc) {
+ ExprNodeGenericFuncDesc udfNode = ((ExprNodeGenericFuncDesc) node);
+ GenericUDF udf = udfNode.getGenericUDF();
+ if (!FunctionRegistry.isDeterministic(udf)) {
+ return false;
+ }
+ joinKeys.addAll(0, udfNode.getChildExprs());
+ }
+ }
+
+ if (joinCols == null || joinCols.isEmpty()) {
+ return false;
+ }
+ TableScanOperator tso = TableAccessAnalyzer.genRootTableScan(topOp, joinCols);
+ if (tso == null) {
+ return false;
+ }
+
+ // For nested sub-queries, the alias mapping is not maintained in QB currently.
+ /*
+ * Consider a query like:
+ *
+ * select count(*) from
+ * (
+ * select key, count(*) from
+ * (
+ * select --mapjoin(a)-- a.key as key, a.value as val1, b.value as val2
+ * from tbl1 a join tbl2 b on a.key = b.key
+ * ) subq1
+ * group by key
+ * ) subq2;
+ *
+ * The table alias should be subq2:subq1:a which needs to be fetched from topOps.
+ */
+ if (pGraphContext.getTopOps().containsValue(tso)) {
+ for (Map.Entry<String, Operator<? extends OperatorDesc>> topOpEntry :
+ this.pGraphContext.getTopOps().entrySet()) {
+ if (topOpEntry.getValue() == tso) {
+ alias = topOpEntry.getKey();
+ aliases[pos] = alias;
+ break;
+ }
+ }
+ }
+ else {
+ // Ideally, this should never happen, and this should be an assert.
+ return false;
+ }
+
+ Table tbl = topToTable.get(tso);
+ if (tbl.isPartitioned()) {
+ PrunedPartitionList prunedParts = null;
+ try {
+ prunedParts = pGraphContext.getOpToPartList().get(tso);
+ if (prunedParts == null) {
+ prunedParts = PartitionPruner.prune(tbl, pGraphContext
+ .getOpToPartPruner().get(tso), pGraphContext.getConf(), alias,
+ pGraphContext.getPrunedPartitions());
+ pGraphContext.getOpToPartList().put(tso, prunedParts);
+ }
+ } catch (HiveException e) {
+ LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+ throw new SemanticException(e.getMessage(), e);
+ }
+ List<Partition> partitions = prunedParts.getNotDeniedPartns();
+ // Populate the names and order of columns for the first partition of the
+ // first table
+ if ((pos == 0) && (partitions != null) && (!partitions.isEmpty())) {
+ Partition firstPartition = partitions.get(0);
+ sortColumnsFirstTable.addAll(firstPartition.getSortCols());
+ }
+
+ for (Partition partition : prunedParts.getNotDeniedPartns()) {
+ if (!checkSortColsAndJoinCols(partition.getSortCols(),
+ joinCols,
+ sortColumnsFirstTable)) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ // Populate the names and order of columns for the first table
+ if (pos == 0) {
+ sortColumnsFirstTable.addAll(tbl.getSortCols());
+ }
+
+ return checkSortColsAndJoinCols(tbl.getSortCols(),
+ joinCols,
+ sortColumnsFirstTable);
+ }
+
+ private boolean checkSortColsAndJoinCols(List<Order> sortCols,
+ List<String> joinCols,
+ List<Order> sortColumnsFirstPartition) {
+
+ if (sortCols == null || sortCols.size() < joinCols.size()) {
+ return false;
+ }
+
+ // A join is eligible for a sort-merge join, only if it is eligible for
+ // a bucketized map join. So, we don't need to check for bucketized map
+ // join here. We are guaranteed that the join keys contain all the
+ // bucketized keys (note that the order need not be the same).
+ List<String> sortColNames = new ArrayList<String>();
+
+ // The join columns should contain all the sort columns
+ // The sort columns of all the tables should be in the same order
+ // compare the column names and the order with the first table/partition.
+ for (int pos = 0; pos < sortCols.size(); pos++) {
+ Order o = sortCols.get(pos);
+
+ if (o.getOrder() != sortColumnsFirstPartition.get(pos).getOrder()) {
+ return false;
+ }
+ sortColNames.add(o.getCol());
+ }
+
+ // The column names and order (ascending/descending) matched
+ // The first 'n' sorted columns should be the same as the joinCols, where
+ // 'n' is the size of join columns.
+ // For eg: if the table is sorted by (a,b,c), it is OK to convert if the join is
+ // on (a), (a,b), or any combination of (a,b,c):
+ // (a,b,c), (a,c,b), (c,a,b), (c,b,a), (b,c,a), (b,a,c)
+ // but it is not OK to convert if the join is on (a,c)
+ return sortColNames.subList(0, joinCols.size()).containsAll(joinCols);
+ }
+
+ // Can the join operator be converted to a sort-merge join operator?
+ // It is already verified that the join can be converted to a bucket map join
+ protected boolean checkConvertJoinToSMBJoin(
+ JoinOperator joinOperator,
+ SortBucketJoinOptProcCtx smbJoinContext,
+ ParseContext pGraphContext) throws SemanticException {
+
+ boolean tableSorted = true;
+ QBJoinTree joinCtx = pGraphContext.getJoinContext().get(joinOperator);
+
+ if (joinCtx == null) {
+ return false;
+ }
+ String[] srcs = joinCtx.getBaseSrc();
+
+ // The sort columns of all the tables/partitions should be in the same order.
+ // For example, if tables A and B are being joined on columns c1, c2 and c3,
+ // which are the sorted and bucketed columns, the join works as long as
+ // c1, c2 and c3 are sorted in the same order.
+ List<Order> sortColumnsFirstTable = new ArrayList<Order>();
+
+ for (int pos = 0; pos < srcs.length; pos++) {
+ tableSorted = tableSorted &&
+ isTableSorted(smbJoinContext,
+ pGraphContext,
+ smbJoinContext.getKeyExprMap().get((byte)pos),
+ joinCtx,
+ srcs,
+ pos,
+ sortColumnsFirstTable);
+ }
+
+ if (!tableSorted) {
+ return false;
+ }
+
+ smbJoinContext.setSrcs(srcs);
+ return true;
+ }
+
+ // Can the join operator be converted to a sort-merge join operator?
+ protected boolean canConvertJoinToSMBJoin(
+ JoinOperator joinOperator,
+ SortBucketJoinOptProcCtx smbJoinContext,
+ ParseContext pGraphContext) throws SemanticException {
+ boolean canConvert =
+ canConvertJoinToBucketMapJoin(
+ joinOperator,
+ pGraphContext,
+ smbJoinContext
+ );
+
+ if (!canConvert) {
+ return false;
+ }
+
+ return checkConvertJoinToSMBJoin(joinOperator, smbJoinContext, pGraphContext);
+ }
+
+ // Can the join operator be converted to a bucket map join operator?
+ protected boolean canConvertJoinToBucketMapJoin(
+ JoinOperator joinOp,
+ ParseContext pGraphContext,
+ SortBucketJoinOptProcCtx context) throws SemanticException {
+
+ // This has already been inspected and rejected
+ if (context.getSetOfRejectedJoins().contains(joinOp)) {
+ return false;
+ }
+
+ QBJoinTree joinCtx = pGraphContext.getJoinContext().get(joinOp);
+ if (joinCtx == null) {
+ return false;
+ }
+
+ Class<? extends SortMergeJoinBigTableMatcher> bigTableMatcherClass = null;
+ try {
+ bigTableMatcherClass =
+ (Class<? extends SortMergeJoinBigTableMatcher>)
+ (Class.forName(HiveConf.getVar(pGraphContext.getConf(),
+ HiveConf.ConfVars.HIVE_AUTO_SORT_MERGE_JOIN_BIGTABLE_MATCHER)));
+ } catch (ClassNotFoundException e) {
+ throw new SemanticException(e.getMessage(), e);
+ }
+
+ SortMergeJoinBigTableMatcher bigTableMatcher =
+ (SortMergeJoinBigTableMatcher) ReflectionUtils.newInstance(bigTableMatcherClass, null);
+ int bigTablePosition =
+ bigTableMatcher.getBigTablePosition(pGraphContext, joinOp);
+ context.setBigTablePosition(bigTablePosition);
+ String joinAlias =
+ bigTablePosition == 0 ?
+ joinCtx.getLeftAlias() : joinCtx.getRightAliases()[bigTablePosition - 1];
+
+ Map<Byte, List<ExprNodeDesc>> keyExprMap = new HashMap<Byte, List<ExprNodeDesc>>();
+ List<Operator<? extends OperatorDesc>> parentOps = joinOp.getParentOperators();
+ // get the join keys from parent ReduceSink operators
+ for (Operator<? extends OperatorDesc> parentOp : parentOps) {
+ ReduceSinkDesc rsconf = ((ReduceSinkOperator)parentOp).getConf();
+ Byte tag = (byte) rsconf.getTag();
+ List<ExprNodeDesc> keys = rsconf.getKeyCols();
+ keyExprMap.put(tag, keys);
+ }
+
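+ // Remember the join keys per input tag - they are needed later to verify that
+ // each table is sorted on its join columns.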
+ context.setKeyExprMap(keyExprMap);
+
+ // Given the big table position chosen by the pluggable sort merge join big
+ // table matcher, check whether the join can be converted to a bucket map join.
+ return checkConvertBucketMapJoin(
+ pGraphContext,
+ context,
+ joinCtx,
+ keyExprMap,
+ joinAlias,
+ Arrays.asList(joinCtx.getBaseSrc()));
+ }
+
+ // Convert the join operator to a bucket map join operator
+ protected MapJoinOperator convertJoinToBucketMapJoin(
+ JoinOperator joinOp,
+ SortBucketJoinOptProcCtx joinContext,
+ ParseContext parseContext) throws SemanticException {
+ MapJoinOperator mapJoinOp = MapJoinProcessor.convertMapJoin(
+ parseContext.getOpParseCtx(),
+ joinOp,
+ pGraphContext.getJoinContext().get(joinOp),
+ joinContext.getBigTablePosition(),
+ false);
+ // Move the mapping from the join context to the mapjoin context
+ parseContext.getMapJoinContext().put(mapJoinOp, parseContext.getJoinContext().get(joinOp));
+ parseContext.getJoinContext().remove(joinOp);
+ convertMapJoinToBucketMapJoin(mapJoinOp, joinContext);
+ return mapJoinOp;
+ }
+
+ // Convert the join operator to a sort-merge join operator
+ protected void convertJoinToSMBJoin(
+ JoinOperator joinOp,
+ SortBucketJoinOptProcCtx smbJoinContext,
+ ParseContext parseContext) throws SemanticException {
+ MapJoinOperator mapJoinOp = convertJoinToBucketMapJoin(joinOp, smbJoinContext, parseContext);
+ SMBMapJoinOperator smbMapJoinOp =
+ convertBucketMapJoinToSMBJoin(mapJoinOp, smbJoinContext, parseContext);
+ smbMapJoinOp.setConvertedAutomaticallySMBJoin(true);
+ }
+}
Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortBucketJoinOptProcCtx.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortBucketJoinOptProcCtx.java (revision 0)
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortBucketJoinOptProcCtx.java (working copy)
@@ -0,0 +1,59 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.optimizer;
+
+import java.util.List;
+import java.util.Map;
+
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
+
+
+public class SortBucketJoinOptProcCtx extends BucketJoinOptProcCtx {
+ private String[] srcs;
+ private int bigTablePosition;
+ private Map<Byte, List<ExprNodeDesc>> keyExprMap;
+
+ public SortBucketJoinOptProcCtx(HiveConf conf) {
+ super(conf);
+ }
+
+ public String[] getSrcs() {
+ return srcs;
+ }
+
+ public void setSrcs(String[] srcs) {
+ this.srcs = srcs;
+ }
+
+ public int getBigTablePosition() {
+ return bigTablePosition;
+ }
+
+ public void setBigTablePosition(int bigTablePosition) {
+ this.bigTablePosition = bigTablePosition;
+ }
+
+ public Map<Byte, List<ExprNodeDesc>> getKeyExprMap() {
+ return keyExprMap;
+ }
+
+ public void setKeyExprMap(Map<Byte, List<ExprNodeDesc>> keyExprMap) {
+ this.keyExprMap = keyExprMap;
+ }
+}
Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortedMergeBucketMapJoinOptimizer.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortedMergeBucketMapJoinOptimizer.java (revision 1432851)
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortedMergeBucketMapJoinOptimizer.java (working copy)
@@ -19,22 +19,19 @@
package org.apache.hadoop.hive.ql.optimizer;
import java.util.ArrayList;
-import java.util.HashMap;
import java.util.LinkedHashMap;
-import java.util.List;
import java.util.Map;
import java.util.Stack;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hive.conf.HiveConf;
-import org.apache.hadoop.hive.metastore.api.Order;
import org.apache.hadoop.hive.ql.ErrorMsg;
-import org.apache.hadoop.hive.ql.exec.DummyStoreOperator;
+import org.apache.hadoop.hive.ql.exec.JoinOperator;
import org.apache.hadoop.hive.ql.exec.MapJoinOperator;
import org.apache.hadoop.hive.ql.exec.Operator;
+import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
import org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator;
-import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
import org.apache.hadoop.hive.ql.lib.Dispatcher;
@@ -44,18 +41,9 @@
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
import org.apache.hadoop.hive.ql.lib.Rule;
import org.apache.hadoop.hive.ql.lib.RuleRegExp;
-import org.apache.hadoop.hive.ql.metadata.HiveException;
-import org.apache.hadoop.hive.ql.metadata.Partition;
-import org.apache.hadoop.hive.ql.metadata.Table;
-import org.apache.hadoop.hive.ql.optimizer.ppr.PartitionPruner;
import org.apache.hadoop.hive.ql.parse.ParseContext;
-import org.apache.hadoop.hive.ql.parse.PrunedPartitionList;
-import org.apache.hadoop.hive.ql.parse.QB;
-import org.apache.hadoop.hive.ql.parse.QBJoinTree;
import org.apache.hadoop.hive.ql.parse.SemanticException;
-import org.apache.hadoop.hive.ql.parse.TableAccessAnalyzer;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
-import org.apache.hadoop.hive.ql.plan.SMBJoinDesc;
//try to replace a bucket map join with a sorted merge map join
public class SortedMergeBucketMapJoinOptimizer implements Transform {
@@ -66,9 +54,38 @@
public SortedMergeBucketMapJoinOptimizer() {
}
+ private void getListOfRejectedJoins(
+ ParseContext pctx, SortBucketJoinOptProcCtx smbJoinContext)
+ throws SemanticException {
+
+ // go through all joins - the operators between the table scans and the join
+ // should only contain selects and filters
+ Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
+ opRules.put(new RuleRegExp("R1", JoinOperator.getOperatorName() + "%"),
+ getCheckCandidateJoin());
+
+ // The dispatcher fires the processor corresponding to the closest matching
+ // rule and passes the context along
+ Dispatcher disp = new DefaultRuleDispatcher(getDefaultProc(), opRules, smbJoinContext);
+ GraphWalker ogw = new DefaultGraphWalker(disp);
+
+ // Create a list of topop nodes
+ ArrayList<Node> topNodes = new ArrayList<Node>();
+ topNodes.addAll(pctx.getTopOps().values());
+ ogw.startWalking(topNodes, null);
+ }
+
@Override
public ParseContext transform(ParseContext pctx) throws SemanticException {
+ HiveConf conf = pctx.getConf();
+ SortBucketJoinOptProcCtx smbJoinContext =
+ new SortBucketJoinOptProcCtx(conf);
+ // Get a list of joins which cannot be converted to a sort merge join
+ // Only select and filter operators are currently allowed between the table scan and
+ // the join. More operators can be added - the method supportAutomaticSortMergeJoin
+ // dictates which operators are allowed
+ getListOfRejectedJoins(pctx, smbJoinContext);
+
Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
// go through all map joins and find out all which have enabled bucket map
// join.
@@ -76,7 +93,15 @@
getSortedMergeBucketMapjoinProc(pctx));
// The dispatcher fires the processor corresponding to the closest matching
// rule and passes the context along
- Dispatcher disp = new DefaultRuleDispatcher(getDefaultProc(), opRules, null);
+
+ // The user need not specify a mapjoin hint for the join to be
+ // converted to a sort-merge join
+ if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_AUTO_SORT_MERGE_JOIN)) {
+ opRules.put(new RuleRegExp("R2", "JOIN%"),
+ getSortedMergeJoinProc(pctx));
+ }
+
+ Dispatcher disp = new DefaultRuleDispatcher(getDefaultProc(), opRules, smbJoinContext);
GraphWalker ogw = new DefaultGraphWalker(disp);
// Create a list of topop nodes
@@ -87,10 +112,41 @@
return pctx;
}
+ class SortedMergeJoinProc extends AbstractSMBJoinProc implements NodeProcessor {
+
+ public SortedMergeJoinProc(ParseContext pctx) {
+ super(pctx);
+ }
+
+ public SortedMergeJoinProc() {
+ }
+
+ @Override
+ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
+ Object... nodeOutputs) throws SemanticException {
+
+ JoinOperator joinOp = (JoinOperator) nd;
+ SortBucketJoinOptProcCtx smbJoinContext = (SortBucketJoinOptProcCtx)procCtx;
+
+ boolean convert =
+ canConvertJoinToSMBJoin(
+ joinOp, smbJoinContext, pGraphContext);
+
+ if (convert) {
+ convertJoinToSMBJoin(joinOp, smbJoinContext, pGraphContext);
+ }
+ return null;
+ }
+ }
+
private NodeProcessor getSortedMergeBucketMapjoinProc(ParseContext pctx) {
return new SortedMergeBucketMapjoinProc(pctx);
}
+ private NodeProcessor getSortedMergeJoinProc(ParseContext pctx) {
+ return new SortedMergeJoinProc(pctx);
+ }
+
private NodeProcessor getDefaultProc() {
return new NodeProcessor() {
@Override
@@ -102,71 +158,58 @@
};
}
- class SortedMergeBucketMapjoinProc extends AbstractBucketJoinProc implements NodeProcessor {
- private ParseContext pGraphContext;
+ // check if the join operator encountered is a candidate for being converted
+ // to a sort-merge join
+ private NodeProcessor getCheckCandidateJoin() {
+ return new NodeProcessor() {
+ @Override
+ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
+ Object... nodeOutputs) throws SemanticException {
+ SortBucketJoinOptProcCtx smbJoinContext = (SortBucketJoinOptProcCtx)procCtx;
+ JoinOperator joinOperator = (JoinOperator)nd;
+ int size = stack.size();
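+ // The join must be immediately preceded by a ReduceSink on the walked path;
+ // any other pattern is not supported for automatic conversion.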
+ if (!(stack.get(size-1) instanceof JoinOperator) ||
+ !(stack.get(size-2) instanceof ReduceSinkOperator)) {
+ smbJoinContext.getSetOfRejectedJoins().add(joinOperator);
+ return null;
+ }
+ // If any operator in the stack does not support an auto-conversion, this join should
+ // not be converted.
+ for (int pos = size - 3; pos >= 0; pos--) {
+ Operator<? extends OperatorDesc> op = (Operator<? extends OperatorDesc>) stack.get(pos);
+ if (!op.supportAutomaticSortMergeJoin()) {
+ smbJoinContext.getSetOfRejectedJoins().add(joinOperator);
+ return null;
+ }
+ }
+
+ return null;
+ }
+ };
+ }
+
+ class SortedMergeBucketMapjoinProc extends AbstractSMBJoinProc implements NodeProcessor {
public SortedMergeBucketMapjoinProc(ParseContext pctx) {
- this.pGraphContext = pctx;
+ super(pctx);
}
public SortedMergeBucketMapjoinProc() {
}
- // Return true or false based on whether the mapjoin was converted successfully to
- // a sort-merge map join operator.
- private boolean convertSMBJoin(Node nd, Stack stack, NodeProcessorCtx procCtx,
- Object... nodeOutputs) throws SemanticException {
+ @Override
+ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
+ Object... nodeOutputs) throws SemanticException {
if (nd instanceof SMBMapJoinOperator) {
- return false;
+ return null;
}
+
MapJoinOperator mapJoinOp = (MapJoinOperator) nd;
- if (mapJoinOp.getConf().getAliasBucketFileNameMapping() == null
- || mapJoinOp.getConf().getAliasBucketFileNameMapping().size() == 0) {
- return false;
- }
+ SortBucketJoinOptProcCtx smbJoinContext = (SortBucketJoinOptProcCtx)procCtx;
- boolean tableSorted = true;
- QBJoinTree joinCxt = this.pGraphContext.getMapJoinContext()
- .get(mapJoinOp);
- if (joinCxt == null) {
- return false;
- }
- String[] srcs = joinCxt.getBaseSrc();
- for (int srcPos = 0; srcPos < srcs.length; srcPos++) {
- srcs[srcPos] = QB.getAppendedAliasFromId(joinCxt.getId(), srcs[srcPos]);
- }
+ boolean convert =
+ canConvertBucketMapJoinToSMBJoin(mapJoinOp, stack, smbJoinContext, nodeOutputs);
- // All the tables/partitions columns should be sorted in the same order
- // For example, if tables A and B are being joined on columns c1, c2 and c3
- // which are the sorted and bucketed columns. The join would work, as long
- // c1, c2 and c3 are sorted in the same order.
- List sortColumnsFirstTable = new ArrayList();
-
- for (int pos = 0; pos < srcs.length; pos++) {
- tableSorted = tableSorted
- && isTableSorted(this.pGraphContext,
- mapJoinOp,
- joinCxt,
- pos,
- sortColumnsFirstTable,
- srcs);
- }
- if (!tableSorted) {
- //this is a mapjoin but not suit for a sort merge bucket map join. check outer joins
- MapJoinProcessor.checkMapJoin(((MapJoinOperator) nd).getConf().getPosBigTable(),
- ((MapJoinOperator) nd).getConf().getConds());
- return false;
- }
- // convert a bucket map join operator to a sorted merge bucket map join
- // operator
- convertToSMBJoin(mapJoinOp, srcs);
- return true;
- }
-
- @Override
- public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx,
- Object... nodeOutputs) throws SemanticException {
- boolean convert = convertSMBJoin(nd, stack, procCtx, nodeOutputs);
// Throw an error if the user asked for sort merge bucketed mapjoin to be enforced
// and sort merge bucketed mapjoin cannot be performed
if (!convert &&
@@ -175,245 +218,10 @@
throw new SemanticException(ErrorMsg.SORTMERGE_MAPJOIN_FAILED.getMsg());
}
+ if (convert) {
+ convertBucketMapJoinToSMBJoin(mapJoinOp, smbJoinContext, pGraphContext);
+ }
return null;
}
-
- private SMBMapJoinOperator convertToSMBJoin(MapJoinOperator mapJoinOp,
- String[] srcs) {
- SMBMapJoinOperator smbJop = new SMBMapJoinOperator(mapJoinOp);
- SMBJoinDesc smbJoinDesc = new SMBJoinDesc(mapJoinOp.getConf());
- smbJop.setConf(smbJoinDesc);
- HashMap tagToAlias = new HashMap();
- for (int i = 0; i < srcs.length; i++) {
- tagToAlias.put((byte) i, srcs[i]);
- }
- smbJoinDesc.setTagToAlias(tagToAlias);
-
- int indexInListMapJoinNoReducer = this.pGraphContext.getListMapJoinOpsNoReducer().indexOf(mapJoinOp);
- if(indexInListMapJoinNoReducer >= 0 ) {
- this.pGraphContext.getListMapJoinOpsNoReducer().remove(indexInListMapJoinNoReducer);
- this.pGraphContext.getListMapJoinOpsNoReducer().add(indexInListMapJoinNoReducer, smbJop);
- }
-
- Map aliasToSink =
- new HashMap();
- // For all parents (other than the big table), insert a dummy store operator
- /* Consider a query like:
- *
- * select * from
- * (subq1 --> has a filter)
- * join
- * (subq2 --> has a filter)
- * on some key
- *
- * Let us assume that subq1 is the small table (either specified by the user or inferred
- * automatically). The following operator tree will be created:
- *
- * TableScan (subq1) --> Select --> Filter --> DummyStore
- * \
- * \ SMBJoin
- * /
- * /
- * TableScan (subq2) --> Select --> Filter
- */
- List extends Operator> parentOperators = mapJoinOp.getParentOperators();
- for (int i = 0; i < parentOperators.size(); i++) {
- Operator par = parentOperators.get(i);
- int index = par.getChildOperators().indexOf(mapJoinOp);
- par.getChildOperators().remove(index);
- if (i == smbJoinDesc.getPosBigTable()) {
- par.getChildOperators().add(index, smbJop);
- }
- else {
- DummyStoreOperator dummyStoreOp = new DummyStoreOperator();
- par.getChildOperators().add(index, dummyStoreOp);
-
- List> childrenOps =
- new ArrayList>();
- childrenOps.add(smbJop);
- dummyStoreOp.setChildOperators(childrenOps);
-
- List> parentOps =
- new ArrayList>();
- parentOps.add(par);
- dummyStoreOp.setParentOperators(parentOps);
-
- aliasToSink.put(srcs[i], dummyStoreOp);
- smbJop.getParentOperators().remove(i);
- smbJop.getParentOperators().add(i, dummyStoreOp);
- }
- }
- smbJoinDesc.setAliasToSink(aliasToSink);
- List extends Operator> childOps = mapJoinOp.getChildOperators();
- for (int i = 0; i < childOps.size(); i++) {
- Operator child = childOps.get(i);
- int index = child.getParentOperators().indexOf(mapJoinOp);
- child.getParentOperators().remove(index);
- child.getParentOperators().add(index, smbJop);
- }
- return smbJop;
- }
-
- /**
- * Whether this table is eligible for a sort-merge join.
- *
- * @param pctx parse context
- * @param op map join operator being considered
- * @param joinTree join tree being considered
- * @param alias table alias in the join tree being checked
- * @param pos position of the table
- * @param sortColumnsFirstTable The names and order of the sorted columns for the first table.
- * It is not initialized when pos = 0.
- * @return
- * @throws SemanticException
- */
- private boolean isTableSorted(ParseContext pctx,
- MapJoinOperator op,
- QBJoinTree joinTree,
- int pos,
- List sortColumnsFirstTable,
- String[] aliases)
- throws SemanticException {
- String alias = aliases[pos];
- Map topToTable = this.pGraphContext
- .getTopToTable();
-
- /*
- * Consider a query like:
- *
- * select -- mapjoin(subq1) -- * from
- * (select a.key, a.value from tbl1 a) subq1
- * join
- * (select a.key, a.value from tbl2 a) subq2
- * on subq1.key = subq2.key;
- *
- * aliasToOpInfo contains the SelectOperator for subq1 and subq2.
- * We need to traverse the tree (using TableAccessAnalyzer) to get to the base
- * table. If the object being map-joined is a base table, then aliasToOpInfo
- * contains the TableScanOperator, and TableAccessAnalyzer is a no-op.
- */
- Operator extends OperatorDesc> topOp = joinTree.getAliasToOpInfo().get(alias);
- if (topOp == null) {
- return false;
- }
- List joinCols = toColumns(op.getConf().getKeys().get((byte) pos));
- if (joinCols == null || joinCols.isEmpty()) {
- return false;
- }
- TableScanOperator tso = TableAccessAnalyzer.genRootTableScan(topOp, joinCols);
- if (tso == null) {
- return false;
- }
-
- // For nested sub-queries, the alias mapping is not maintained in QB currently.
- /*
- * Consider a query like:
- *
- * select count(*) from
- * (
- * select key, count(*) from
- * (
- * select --mapjoin(a)-- a.key as key, a.value as val1, b.value as val2
- * from tbl1 a join tbl2 b on a.key = b.key
- * ) subq1
- * group by key
- * ) subq2;
- *
- * The table alias should be subq2:subq1:a which needs to be fetched from topOps.
- */
- if (pGraphContext.getTopOps().containsValue(tso)) {
- for (Map.Entry> topOpEntry :
- this.pGraphContext.getTopOps().entrySet()) {
- if (topOpEntry.getValue() == tso) {
- alias = topOpEntry.getKey();
- aliases[pos] = alias;
- break;
- }
- }
- }
- else {
- // Ideally, this should never happen, and this should be an assert.
- return false;
- }
-
- Table tbl = topToTable.get(tso);
-
- if (tbl.isPartitioned()) {
- PrunedPartitionList prunedParts = null;
- try {
- prunedParts = pGraphContext.getOpToPartList().get(tso);
- if (prunedParts == null) {
- prunedParts = PartitionPruner.prune(tbl, pGraphContext
- .getOpToPartPruner().get(tso), pGraphContext.getConf(), alias,
- pGraphContext.getPrunedPartitions());
- pGraphContext.getOpToPartList().put(tso, prunedParts);
- }
- } catch (HiveException e) {
- LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
- throw new SemanticException(e.getMessage(), e);
- }
- List partitions = prunedParts.getNotDeniedPartns();
- // Populate the names and order of columns for the first partition of the
- // first table
- if ((pos == 0) && (partitions != null) && (!partitions.isEmpty())) {
- Partition firstPartition = partitions.get(0);
- sortColumnsFirstTable.addAll(firstPartition.getSortCols());
- }
-
- for (Partition partition : prunedParts.getNotDeniedPartns()) {
- if (!checkSortColsAndJoinCols(partition.getSortCols(),
- joinCols,
- sortColumnsFirstTable)) {
- return false;
- }
- }
- return true;
- }
-
- // Populate the names and order of columns for the first table
- if (pos == 0) {
- sortColumnsFirstTable.addAll(tbl.getSortCols());
- }
-
- return checkSortColsAndJoinCols(tbl.getSortCols(),
- joinCols,
- sortColumnsFirstTable);
- }
-
- private boolean checkSortColsAndJoinCols(List sortCols,
- List joinCols,
- List sortColumnsFirstPartition) {
-
- if (sortCols == null || sortCols.size() < joinCols.size()) {
- return false;
- }
-
- // A join is eligible for a sort-merge join, only if it is eligible for
- // a bucketized map join. So, we dont need to check for bucketized map
- // join here. We are guaranteed that the join keys contain all the
- // bucketized keys (note that the order need not be the same).
- List sortColNames = new ArrayList();
-
- // The join columns should contain all the sort columns
- // The sort columns of all the tables should be in the same order
- // compare the column names and the order with the first table/partition.
- for (int pos = 0; pos < sortCols.size(); pos++) {
- Order o = sortCols.get(pos);
- if (o.getOrder() != sortColumnsFirstPartition.get(pos).getOrder()) {
- return false;
- }
- sortColNames.add(o.getCol());
- }
-
- // The column names and order (ascending/descending) matched
- // The first 'n' sorted columns should be the same as the joinCols, where
- // 'n' is the size of join columns.
- // For eg: if the table is sorted by (a,b,c), it is OK to convert if the join is
- // on (a), (a,b), or any combination of (a,b,c):
- // (a,b,c), (a,c,b), (c,a,b), (c,b,a), (b,c,a), (b,a,c)
- // but it is not OK to convert if the join is on (a,c)
- return sortColNames.subList(0, joinCols.size()).containsAll(joinCols);
- }
}
-
}
Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/LeftSortMergeJoinBigTableMatcher.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/LeftSortMergeJoinBigTableMatcher.java (revision 0)
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/LeftSortMergeJoinBigTableMatcher.java (working copy)
@@ -0,0 +1,32 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.optimizer;
+
+import org.apache.hadoop.hive.ql.exec.JoinOperator;
+import org.apache.hadoop.hive.ql.parse.ParseContext;
+
+/*
+ * This is a pluggable policy to choose the big table for converting a join to a
+ * sort-merge join. The leftmost table is chosen as the big table.
+ */
+public class LeftSortMergeJoinBigTableMatcher implements SortMergeJoinBigTableMatcher {
+ public int getBigTablePosition(ParseContext parseContext, JoinOperator joinOp) {
+ return 0;
+ }
+}
Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java (revision 1432851)
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java (working copy)
@@ -74,12 +74,18 @@
}
transformations.add(new SamplePruner());
transformations.add(new MapJoinProcessor());
+ boolean bucketMapJoinOptimizer = false;
if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTBUCKETMAPJOIN)) {
transformations.add(new BucketMapJoinOptimizer());
- if(HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTSORTMERGEBUCKETMAPJOIN)) {
- transformations.add(new SortedMergeBucketMapJoinOptimizer());
+ bucketMapJoinOptimizer = true; // remember that the bucket map join optimizer is already registered
+ }
+
+ if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTSORTMERGEBUCKETMAPJOIN)) {
+ if (!bucketMapJoinOptimizer) {
+ transformations.add(new BucketMapJoinOptimizer());
}
+ transformations.add(new SortedMergeBucketMapJoinOptimizer());
}
+
transformations.add(new UnionProcessor());
transformations.add(new JoinReorder());
if(HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTREDUCEDEDUPLICATION)) {
Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/BucketJoinOptProcCtx.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/BucketJoinOptProcCtx.java (revision 0)
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/BucketJoinOptProcCtx.java (working copy)
@@ -0,0 +1,146 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.optimizer;
+
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.ql.exec.JoinOperator;
+import org.apache.hadoop.hive.ql.exec.MapJoinOperator;
+import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
+import org.apache.hadoop.hive.ql.metadata.Partition;
+
+public class BucketJoinOptProcCtx implements NodeProcessorCtx {
+ private static final Log LOG =
+ LogFactory.getLog(BucketJoinOptProcCtx.class.getName());
+
+ private final HiveConf conf;
+
+ // we only convert map joins that follow a root table scan in the same
+ // mapper. That means there is no reducer between the root table scan and
+ // the mapjoin.
+ private Set<MapJoinOperator> setOfRejectedMapjoins = new HashSet<MapJoinOperator>();
+ private Set<JoinOperator> setOfRejectedJoins = new HashSet<JoinOperator>();
+
+ // The set of join operators which can be converted to a bucketed map join
+ private Set<JoinOperator> setOfConvertedJoins = new HashSet<JoinOperator>();
+
+ private Map<String, List<Integer>> aliasToPartitionBucketNumberMapping;
+ private Map<String, List<List<String>>> aliasToPartitionBucketFileNamesMapping;
+ private Map<Partition, List<String>> bigTblPartsToBucketFileNames;
+ private Map<Partition, Integer> bigTblPartsToBucketNumber;
+ private List<String> joinAliases;
+ private String baseBigAlias;
+ private boolean bigTablePartitioned;
+
+ public BucketJoinOptProcCtx(HiveConf conf) {
+ this.conf = conf;
+ }
+
+ public HiveConf getConf() {
+ return conf;
+ }
+
+ public Set<MapJoinOperator> getSetOfRejectedMapjoins() {
+ return setOfRejectedMapjoins;
+ }
+
+ public void setSetOfRejectedMapjoins(Set<MapJoinOperator> listOfRejectedMapjoins) {
+ this.setOfRejectedMapjoins = listOfRejectedMapjoins;
+ }
+
+ public Set<JoinOperator> getSetOfRejectedJoins() {
+ return setOfRejectedJoins;
+ }
+
+ public Set<JoinOperator> getSetOfConvertedJoins() {
+ return setOfConvertedJoins;
+ }
+
+ public void setSetOfRejectedJoins(Set<JoinOperator> setOfRejectedJoins) {
+ this.setOfRejectedJoins = setOfRejectedJoins;
+ }
+
+ public void setSetOfConvertedJoins(Set<JoinOperator> setOfConvertedJoins) {
+ this.setOfConvertedJoins = setOfConvertedJoins;
+ }
+
+ public Map<String, List<Integer>> getAliasToPartitionBucketNumberMapping() {
+ return aliasToPartitionBucketNumberMapping;
+ }
+
+ public Map<String, List<List<String>>> getAliasToPartitionBucketFileNamesMapping() {
+ return aliasToPartitionBucketFileNamesMapping;
+ }
+
+ public Map<Partition, List<String>> getBigTblPartsToBucketFileNames() {
+ return bigTblPartsToBucketFileNames;
+ }
+
+ public Map<Partition, Integer> getBigTblPartsToBucketNumber() {
+ return bigTblPartsToBucketNumber;
+ }
+
+ public void setAliasToPartitionBucketNumberMapping(
+ Map<String, List<Integer>> aliasToPartitionBucketNumberMapping) {
+ this.aliasToPartitionBucketNumberMapping = aliasToPartitionBucketNumberMapping;
+ }
+
+ public void setAliasToPartitionBucketFileNamesMapping(
+ Map<String, List<List<String>>> aliasToPartitionBucketFileNamesMapping) {
+ this.aliasToPartitionBucketFileNamesMapping = aliasToPartitionBucketFileNamesMapping;
+ }
+
+ public void setBigTblPartsToBucketFileNames(
+ Map<Partition, List<String>> bigTblPartsToBucketFileNames) {
+ this.bigTblPartsToBucketFileNames = bigTblPartsToBucketFileNames;
+ }
+
+ public void setBigTblPartsToBucketNumber(Map<Partition, Integer> bigTblPartsToBucketNumber) {
+ this.bigTblPartsToBucketNumber = bigTblPartsToBucketNumber;
+ }
+
+ public void setJoinAliases(List<String> joinAliases) {
+ this.joinAliases = joinAliases;
+ }
+
+ public void setBaseBigAlias(String baseBigAlias) {
+ this.baseBigAlias = baseBigAlias;
+ }
+
+ public List<String> getJoinAliases() {
+ return joinAliases;
+ }
+
+ public String getBaseBigAlias() {
+ return baseBigAlias;
+ }
+
+ public boolean isBigTablePartitioned() {
+ return bigTablePartitioned;
+ }
+
+ public void setBigTablePartitioned(boolean bigTablePartitioned) {
+ this.bigTablePartitioned = bigTablePartitioned;
+ }
+}
Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortMergeJoinBigTableMatcher.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortMergeJoinBigTableMatcher.java (revision 0)
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortMergeJoinBigTableMatcher.java (working copy)
@@ -0,0 +1,28 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.optimizer;
+
+import org.apache.hadoop.hive.ql.exec.JoinOperator;
+import org.apache.hadoop.hive.ql.parse.ParseContext;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+
+public interface SortMergeJoinBigTableMatcher {
+ public int getBigTablePosition(ParseContext parseContext, JoinOperator joinOp)
+ throws SemanticException;
+}
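(For illustration only, not part of the patch: a new policy only needs to implement this
interface and be named in hive.auto.sort.merge.join.bigtbl.matcher. A hypothetical matcher
that always nominates the rightmost join input as the big table might look like this.)

    package org.apache.hadoop.hive.ql.optimizer;

    import org.apache.hadoop.hive.ql.exec.JoinOperator;
    import org.apache.hadoop.hive.ql.parse.ParseContext;

    public class RightSortMergeJoinBigTableMatcher implements SortMergeJoinBigTableMatcher {
      public int getBigTablePosition(ParseContext parseContext, JoinOperator joinOp) {
        // join inputs are tagged 0..n-1 from left to right, one per parent ReduceSink
        return joinOp.getParentOperators().size() - 1;
      }
    }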
Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/CommonSizeSortMergeJoinBigTableMatcher.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/CommonSizeSortMergeJoinBigTableMatcher.java (revision 0)
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/CommonSizeSortMergeJoinBigTableMatcher.java (working copy)
@@ -0,0 +1,84 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.optimizer;
+
+import java.util.List;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.ql.exec.Operator;
+import org.apache.hadoop.hive.ql.exec.TableScanOperator;
+import org.apache.hadoop.hive.ql.metadata.Partition;
+import org.apache.hadoop.hive.ql.metadata.Table;
+import org.apache.hadoop.hive.ql.plan.OperatorDesc;
+
+/*
+ * Common base class for the size-based policies that choose the big table for converting
+ * a join to a sort-merge join. It provides helpers to find the table scans feeding a join
+ * and to compute the size of a table or partition.
+ */
+public class CommonSizeSortMergeJoinBigTableMatcher {
+ protected void getListTopOps(
+ Operator<? extends OperatorDesc> op, List<TableScanOperator> topOps) {
+ if ((op.getParentOperators() == null) ||
+ (op.getParentOperators().isEmpty())) {
+ return;
+ }
+
+ for (Operator<? extends OperatorDesc> parentOp : op.getParentOperators()) {
+ if (parentOp instanceof TableScanOperator) {
+ topOps.add((TableScanOperator)parentOp);
+ }
+ else {
+ getListTopOps(parentOp, topOps);
+ }
+ }
+ }
+
+ private long getSize(HiveConf conf, String size, Path path) {
+ // If the size is present in the metastore, use it
+ if (size != null) {
+ try {
+ return Long.valueOf(size);
+ } catch (NumberFormatException e) {
+ return 0;
+ }
+ }
+
+ try {
+ FileSystem fs = path.getFileSystem(conf);
+ return fs.getContentSummary(path).getLength();
+ } catch (Exception e) {
+ return 0;
+ }
+ }
+
+ protected long getSize(HiveConf conf, Table table) {
+ Path path = table.getPath();
+ String size = table.getProperty("totalSize");
+ return getSize(conf, size, path);
+ }
+
+ protected long getSize(HiveConf conf, Partition partition) {
+ Path path = partition.getPartitionPath();
+ String size = partition.getParameters().get("totalSize");
+
+ return getSize(conf, size, path);
+ }
+}
Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/BucketMapJoinOptimizer.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/BucketMapJoinOptimizer.java (revision 1432851)
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/BucketMapJoinOptimizer.java (working copy)
@@ -17,32 +17,17 @@
*/
package org.apache.hadoop.hive.ql.optimizer;
-import java.io.IOException;
-import java.net.URI;
import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Iterator;
import java.util.LinkedHashMap;
-import java.util.List;
import java.util.Map;
-import java.util.Map.Entry;
-import java.util.Set;
import java.util.Stack;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.ErrorMsg;
import org.apache.hadoop.hive.ql.exec.MapJoinOperator;
-import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
-import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.exec.UnionOperator;
import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
@@ -53,18 +38,8 @@
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
import org.apache.hadoop.hive.ql.lib.Rule;
import org.apache.hadoop.hive.ql.lib.RuleRegExp;
-import org.apache.hadoop.hive.ql.metadata.HiveException;
-import org.apache.hadoop.hive.ql.metadata.Partition;
-import org.apache.hadoop.hive.ql.metadata.Table;
-import org.apache.hadoop.hive.ql.optimizer.ppr.PartitionPruner;
import org.apache.hadoop.hive.ql.parse.ParseContext;
-import org.apache.hadoop.hive.ql.parse.PrunedPartitionList;
-import org.apache.hadoop.hive.ql.parse.QB;
-import org.apache.hadoop.hive.ql.parse.QBJoinTree;
import org.apache.hadoop.hive.ql.parse.SemanticException;
-import org.apache.hadoop.hive.ql.parse.TableAccessAnalyzer;
-import org.apache.hadoop.hive.ql.plan.MapJoinDesc;
-import org.apache.hadoop.hive.ql.plan.OperatorDesc;
/**
* this transformation does bucket map join optimization.
@@ -81,8 +56,8 @@
public ParseContext transform(ParseContext pctx) throws SemanticException {
Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
- BucketMapjoinOptProcCtx bucketMapJoinOptimizeCtx =
- new BucketMapjoinOptProcCtx(pctx.getConf());
+ BucketJoinOptProcCtx bucketMapJoinOptimizeCtx =
+ new BucketJoinOptProcCtx(pctx.getConf());
// process map joins with no reducers pattern
opRules.put(new RuleRegExp("R1",
@@ -116,11 +91,11 @@
return new NodeProcessor() {
@Override
public Object process(Node nd, Stack stack,
- NodeProcessorCtx procCtx, Object... nodeOutputs)
+ NodeProcessorCtx procCtx, Object... nodeOutputs)
throws SemanticException {
MapJoinOperator mapJoinOp = (MapJoinOperator) nd;
- BucketMapjoinOptProcCtx context = (BucketMapjoinOptProcCtx) procCtx;
- context.listOfRejectedMapjoins.add(mapJoinOp);
+ BucketJoinOptProcCtx context = (BucketJoinOptProcCtx) procCtx;
+ context.getSetOfRejectedMapjoins().add(mapJoinOp);
return null;
}
};
@@ -143,270 +118,20 @@
class BucketMapjoinOptProc extends AbstractBucketJoinProc implements NodeProcessor {
- protected ParseContext pGraphContext;
-
public BucketMapjoinOptProc(ParseContext pGraphContext) {
- super();
- this.pGraphContext = pGraphContext;
+ super(pGraphContext);
}
- private boolean convertBucketMapJoin(Node nd, Stack stack, NodeProcessorCtx procCtx,
- Object... nodeOutputs) throws SemanticException {
- MapJoinOperator mapJoinOp = (MapJoinOperator) nd;
- BucketMapjoinOptProcCtx context = (BucketMapjoinOptProcCtx) procCtx;
- HiveConf conf = context.getConf();
-
- if (context.getListOfRejectedMapjoins().contains(mapJoinOp)) {
- return false;
- }
-
- QBJoinTree joinCxt = this.pGraphContext.getMapJoinContext().get(mapJoinOp);
- if (joinCxt == null) {
- return false;
- }
-
- List joinAliases = new ArrayList();
- String[] srcs = joinCxt.getBaseSrc();
- String[] left = joinCxt.getLeftAliases();
- List mapAlias = joinCxt.getMapAliases();
- String baseBigAlias = null;
-
- for (String s : left) {
- if (s != null) {
- String subQueryAlias = QB.getAppendedAliasFromId(joinCxt.getId(), s);
- if (!joinAliases.contains(subQueryAlias)) {
- joinAliases.add(subQueryAlias);
- if(!mapAlias.contains(s)) {
- baseBigAlias = subQueryAlias;
- }
- }
- }
- }
-
- for (String s : srcs) {
- if (s != null) {
- String subQueryAlias = QB.getAppendedAliasFromId(joinCxt.getId(), s);
- if (!joinAliases.contains(subQueryAlias)) {
- joinAliases.add(subQueryAlias);
- if(!mapAlias.contains(s)) {
- baseBigAlias = subQueryAlias;
- }
- }
- }
- }
-
- MapJoinDesc mjDesc = mapJoinOp.getConf();
- LinkedHashMap> aliasToPartitionBucketNumberMapping =
- new LinkedHashMap>();
- LinkedHashMap>> aliasToPartitionBucketFileNamesMapping =
- new LinkedHashMap>>();
-
- Map> topOps =
- this.pGraphContext.getTopOps();
- Map topToTable = this.pGraphContext.getTopToTable();
-
- // (partition to bucket file names) and (partition to bucket number) for
- // the big table;
- LinkedHashMap> bigTblPartsToBucketFileNames = new LinkedHashMap>();
- LinkedHashMap bigTblPartsToBucketNumber = new LinkedHashMap();
-
- Integer[] orders = null; // accessing order of join cols to bucket cols, should be same
- boolean bigTablePartitioned = true;
- for (int index = 0; index < joinAliases.size(); index++) {
- String alias = joinAliases.get(index);
- Operator extends OperatorDesc> topOp = joinCxt.getAliasToOpInfo().get(alias);
- if (topOp == null) {
- return false;
- }
- List keys = toColumns(mjDesc.getKeys().get((byte) index));
- if (keys == null || keys.isEmpty()) {
- return false;
- }
- int oldKeySize = keys.size();
- TableScanOperator tso = TableAccessAnalyzer.genRootTableScan(topOp, keys);
- if (tso == null) {
- return false;
- }
-
- // For nested sub-queries, the alias mapping is not maintained in QB currently.
- if (topOps.containsValue(tso)) {
- for (Map.Entry> topOpEntry : topOps.entrySet()) {
- if (topOpEntry.getValue() == tso) {
- String newAlias = topOpEntry.getKey();
- joinAliases.set(index, newAlias);
- if (baseBigAlias.equals(alias)) {
- baseBigAlias = newAlias;
- }
- alias = newAlias;
- break;
- }
- }
- }
- else {
- // Ideally, this should never happen, and this should be an assert.
- return false;
- }
-
- // The join keys cannot be transformed in the sub-query currently.
- // TableAccessAnalyzer.genRootTableScan will only return the base table scan
- // if the join keys are constants or a column. Even a simple cast of the join keys
- // will result in a null table scan operator. In case of constant join keys, they would
- // be removed, and the size before and after the genRootTableScan will be different.
- if (keys.size() != oldKeySize) {
- return false;
- }
- if (orders == null) {
- orders = new Integer[keys.size()];
- }
-
- Table tbl = topToTable.get(tso);
- if (tbl.isPartitioned()) {
- PrunedPartitionList prunedParts;
- try {
- prunedParts = pGraphContext.getOpToPartList().get(tso);
- if (prunedParts == null) {
- prunedParts = PartitionPruner.prune(tbl, pGraphContext.getOpToPartPruner().get(tso),
- pGraphContext.getConf(), alias,
- pGraphContext.getPrunedPartitions());
- pGraphContext.getOpToPartList().put(tso, prunedParts);
- }
- } catch (HiveException e) {
- // Has to use full name to make sure it does not conflict with
- // org.apache.commons.lang.StringUtils
- LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
- throw new SemanticException(e.getMessage(), e);
- }
- List<Partition> partitions = prunedParts.getNotDeniedPartns();
- // construct a mapping of (Partition->bucket file names) and (Partition -> bucket number)
- if (partitions.isEmpty()) {
- if (!alias.equals(baseBigAlias)) {
- aliasToPartitionBucketNumberMapping.put(alias, Arrays.<Integer>asList());
- aliasToPartitionBucketFileNamesMapping.put(alias, new ArrayList<List<String>>());
- }
- } else {
- List<Integer> buckets = new ArrayList<Integer>();
- List<List<String>> files = new ArrayList<List<String>>();
- for (Partition p : partitions) {
- if (!checkBucketColumns(p.getBucketCols(), keys, orders)) {
- return false;
- }
- List<String> fileNames = getOnePartitionBucketFileNames(p.getDataLocation());
- // The number of files for the table should be same as number of buckets.
- int bucketCount = p.getBucketCount();
- if (fileNames.size() != bucketCount) {
- String msg = "The number of buckets for table " +
- tbl.getTableName() + " partition " + p.getName() + " is " +
- p.getBucketCount() + ", whereas the number of files is " + fileNames.size();
- throw new SemanticException(
- ErrorMsg.BUCKETED_TABLE_METADATA_INCORRECT.getMsg(msg));
- }
- if (alias.equals(baseBigAlias)) {
- bigTblPartsToBucketFileNames.put(p, fileNames);
- bigTblPartsToBucketNumber.put(p, bucketCount);
- } else {
- files.add(fileNames);
- buckets.add(bucketCount);
- }
- }
- if (!alias.equals(baseBigAlias)) {
- aliasToPartitionBucketNumberMapping.put(alias, buckets);
- aliasToPartitionBucketFileNamesMapping.put(alias, files);
- }
- }
- } else {
- if (!checkBucketColumns(tbl.getBucketCols(), keys, orders)) {
- return false;
- }
- List<String> fileNames = getOnePartitionBucketFileNames(tbl.getDataLocation());
- Integer num = new Integer(tbl.getNumBuckets());
- // The number of files for the table should be same as number of buckets.
- if (fileNames.size() != num) {
- String msg = "The number of buckets for table " +
- tbl.getTableName() + " is " + tbl.getNumBuckets() +
- ", whereas the number of files is " + fileNames.size();
- throw new SemanticException(
- ErrorMsg.BUCKETED_TABLE_METADATA_INCORRECT.getMsg(msg));
- }
- if (alias.equals(baseBigAlias)) {
- bigTblPartsToBucketFileNames.put(null, fileNames);
- bigTblPartsToBucketNumber.put(null, tbl.getNumBuckets());
- bigTablePartitioned = false;
- } else {
- aliasToPartitionBucketNumberMapping.put(alias, Arrays.asList(num));
- aliasToPartitionBucketFileNamesMapping.put(alias, Arrays.asList(fileNames));
- }
- }
- }
-
- // All tables or partitions are bucketed, and their bucket number is
- // stored in 'bucketNumbers', we need to check if the number of buckets in
- // the big table can be divided by no of buckets in small tables.
- for (Integer bucketNumber : bigTblPartsToBucketNumber.values()) {
- if (!checkBucketNumberAgainstBigTable(aliasToPartitionBucketNumberMapping, bucketNumber)) {
- return false;
- }
- }
-
- MapJoinDesc desc = mapJoinOp.getConf();
-
- Map<String, Map<String, List<String>>> aliasBucketFileNameMapping =
- new LinkedHashMap<String, Map<String, List<String>>>();
-
- // sort bucket names for the big table
- for (List<String> partBucketNames : bigTblPartsToBucketFileNames.values()) {
- Collections.sort(partBucketNames);
- }
-
- // go through all small tables and get the mapping from bucket file name
- // in the big table to bucket file names in small tables.
- for (int j = 0; j < joinAliases.size(); j++) {
- String alias = joinAliases.get(j);
- if (alias.equals(baseBigAlias)) {
- continue;
- }
- for (List<String> names : aliasToPartitionBucketFileNamesMapping.get(alias)) {
- Collections.sort(names);
- }
- List<Integer> smallTblBucketNums = aliasToPartitionBucketNumberMapping.get(alias);
- List<List<String>> smallTblFilesList = aliasToPartitionBucketFileNamesMapping.get(alias);
-
- Map<String, List<String>> mapping = new LinkedHashMap<String, List<String>>();
- aliasBucketFileNameMapping.put(alias, mapping);
-
- // for each bucket file in big table, get the corresponding bucket file
- // name in the small table.
- // more than 1 partition in the big table, do the mapping for each partition
- Iterator<Map.Entry<Partition, List<String>>> bigTblPartToBucketNames =
- bigTblPartsToBucketFileNames.entrySet().iterator();
- Iterator<Map.Entry<Partition, Integer>> bigTblPartToBucketNum = bigTblPartsToBucketNumber
- .entrySet().iterator();
- while (bigTblPartToBucketNames.hasNext()) {
- assert bigTblPartToBucketNum.hasNext();
- int bigTblBucketNum = bigTblPartToBucketNum.next().getValue();
- List<String> bigTblBucketNameList = bigTblPartToBucketNames.next().getValue();
- fillMapping(smallTblBucketNums, smallTblFilesList,
- mapping, bigTblBucketNum, bigTblBucketNameList, desc.getBigTableBucketNumMapping());
- }
- }
- desc.setAliasBucketFileNameMapping(aliasBucketFileNameMapping);
- desc.setBigTableAlias(baseBigAlias);
- if (bigTablePartitioned) {
- desc.setBigTablePartSpecToFileMapping(convert(bigTblPartsToBucketFileNames));
- }
- // successfully convert to bucket map join
- desc.setBucketMapJoin(true);
-
- return true;
- }
-
-
@Override
@SuppressWarnings("unchecked")
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
Object... nodeOutputs) throws SemanticException {
+ BucketJoinOptProcCtx context = (BucketJoinOptProcCtx) procCtx;
+ MapJoinOperator mapJoinOperator = (MapJoinOperator) nd;
- boolean convert = convertBucketMapJoin(nd, stack, procCtx, nodeOutputs);
- BucketMapjoinOptProcCtx context = (BucketMapjoinOptProcCtx) procCtx;
+ // can the current mapjoin be converted to a bucket map join?
+ boolean convert = canConvertMapJoinToBucketMapJoin(
+ mapJoinOperator, pGraphContext, context);
HiveConf conf = context.getConf();
// Throw an error if the user asked for bucketed mapjoin to be enforced and
@@ -415,119 +140,12 @@
throw new SemanticException(ErrorMsg.BUCKET_MAPJOIN_NOT_POSSIBLE.getMsg());
}
- return null;
- }
-
- // convert partition to partition spec string
- private Map<String, List<String>> convert(Map<Partition, List<String>> mapping) {
- Map<String, List<String>> converted = new HashMap<String, List<String>>();
- for (Map.Entry<Partition, List<String>> entry : mapping.entrySet()) {
- converted.put(entry.getKey().getName(), entry.getValue());
+ if (convert) {
+ // convert the mapjoin to a bucketized mapjoin
+ convertMapJoinToBucketMapJoin(mapJoinOperator, context);
}
- return converted;
- }
- // called for each partition of big table and populates mapping for each file in the partition
- private void fillMapping(
- List<Integer> smallTblBucketNums,
- List<List<String>> smallTblFilesList,
- Map<String, List<String>> mapping,
- int bigTblBucketNum, List<String> bigTblBucketNameList,
- Map<String, Integer> bucketFileNameMapping) {
-
- for (int bindex = 0; bindex < bigTblBucketNameList.size(); bindex++) {
- ArrayList<String> resultFileNames = new ArrayList<String>();
- for (int sindex = 0; sindex < smallTblBucketNums.size(); sindex++) {
- int smallTblBucketNum = smallTblBucketNums.get(sindex);
- List<String> smallTblFileNames = smallTblFilesList.get(sindex);
- if (bigTblBucketNum >= smallTblBucketNum) {
- // if the big table has more buckets than the current small table,
- // use "MOD" to get small table bucket names. For example, if the big
- // table has 4 buckets and the small table has 2 buckets, then the
- // mapping should be 0->0, 1->1, 2->0, 3->1.
- int toAddSmallIndex = bindex % smallTblBucketNum;
- resultFileNames.add(smallTblFileNames.get(toAddSmallIndex));
- } else {
- int jump = smallTblBucketNum / bigTblBucketNum;
- for (int i = bindex; i < smallTblFileNames.size(); i = i + jump) {
- resultFileNames.add(smallTblFileNames.get(i));
- }
- }
- }
- String inputBigTBLBucket = bigTblBucketNameList.get(bindex);
- mapping.put(inputBigTBLBucket, resultFileNames);
- bucketFileNameMapping.put(inputBigTBLBucket, bindex);
- }
+ return null;
}
-
- private boolean checkBucketNumberAgainstBigTable(
- Map<String, List<Integer>> aliasToBucketNumber, int bucketNumberInPart) {
- for (List<Integer> bucketNums : aliasToBucketNumber.values()) {
- for (int nxt : bucketNums) {
- boolean ok = (nxt >= bucketNumberInPart) ? nxt % bucketNumberInPart == 0
- : bucketNumberInPart % nxt == 0;
- if (!ok) {
- return false;
- }
- }
- }
- return true;
- }
-
- private List<String> getOnePartitionBucketFileNames(URI location)
- throws SemanticException {
- List<String> fileNames = new ArrayList<String>();
- try {
- FileSystem fs = FileSystem.get(location, this.pGraphContext.getConf());
- FileStatus[] files = fs.listStatus(new Path(location.toString()));
- if (files != null) {
- for (FileStatus file : files) {
- fileNames.add(file.getPath().toString());
- }
- }
- } catch (IOException e) {
- throw new SemanticException(e);
- }
- return fileNames;
- }
-
- private boolean checkBucketColumns(List<String> bucketColumns, List<String> keys,
- Integer[] orders) {
- if (keys == null || bucketColumns == null || bucketColumns.isEmpty()) {
- return false;
- }
- for (int i = 0; i < keys.size(); i++) {
- int index = bucketColumns.indexOf(keys.get(i));
- if (orders[i] != null && orders[i] != index) {
- return false;
- }
- orders[i] = index;
- }
- // Check if the join columns contains all bucket columns.
- // If a table is bucketized on column B, but the join key is A and B,
- // it is easy to see joining on different buckets yield empty results.
- return keys.containsAll(bucketColumns);
- }
}
-
- class BucketMapjoinOptProcCtx implements NodeProcessorCtx {
- private final HiveConf conf;
-
- // we only convert map joins that follows a root table scan in the same
- // mapper. That means there is no reducer between the root table scan and
- // mapjoin.
- Set<MapJoinOperator> listOfRejectedMapjoins = new HashSet<MapJoinOperator>();
-
- public BucketMapjoinOptProcCtx(HiveConf conf) {
- this.conf = conf;
- }
-
- public HiveConf getConf() {
- return conf;
- }
-
- public Set<MapJoinOperator> getListOfRejectedMapjoins() {
- return listOfRejectedMapjoins;
- }
- }
}
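
For reference, the bucket-column consistency rule enforced by checkBucketColumns (deleted above and re-homed in AbstractBucketJoinProc below) is easiest to see with concrete values. The following standalone sketch is illustrative only and not part of the patch; the column names are made up, but the index arithmetic mirrors the method:

    import java.util.Arrays;
    import java.util.List;

    public class BucketColumnOrderDemo {
      // Mirrors checkBucketColumns(): each join key must sit at the same
      // bucket-column position in every table, and the join keys must cover
      // all bucket columns.
      static boolean check(List<String> bucketColumns, List<String> keys, Integer[] orders) {
        if (keys == null || bucketColumns == null || bucketColumns.isEmpty()) {
          return false;
        }
        for (int i = 0; i < keys.size(); i++) {
          int index = bucketColumns.indexOf(keys.get(i));
          if (orders[i] != null && orders[i] != index) {
            return false; // key found at a different position than in an earlier table
          }
          orders[i] = index;
        }
        // joining on a proper subset of the bucket columns cannot be matched bucket-wise
        return keys.containsAll(bucketColumns);
      }

      public static void main(String[] args) {
        List<String> keys = Arrays.asList("a", "b");
        Integer[] orders = new Integer[keys.size()];
        System.out.println(check(Arrays.asList("a", "b"), keys, orders)); // true: a->0, b->1
        System.out.println(check(Arrays.asList("b", "a"), keys, orders)); // false: positions flipped
      }
    }
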
Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/AbstractBucketJoinProc.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/AbstractBucketJoinProc.java (revision 1432851)
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/AbstractBucketJoinProc.java (working copy)
@@ -17,34 +17,428 @@
*/
package org.apache.hadoop.hive.ql.optimizer;
+import java.io.IOException;
+import java.net.URI;
import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.LinkedHashMap;
import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
import java.util.Stack;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.ErrorMsg;
+import org.apache.hadoop.hive.ql.exec.MapJoinOperator;
+import org.apache.hadoop.hive.ql.exec.Operator;
+import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.NodeProcessor;
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.metadata.Partition;
+import org.apache.hadoop.hive.ql.metadata.Table;
+import org.apache.hadoop.hive.ql.optimizer.ppr.PartitionPruner;
+import org.apache.hadoop.hive.ql.parse.ParseContext;
+import org.apache.hadoop.hive.ql.parse.PrunedPartitionList;
+import org.apache.hadoop.hive.ql.parse.QB;
+import org.apache.hadoop.hive.ql.parse.QBJoinTree;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
+import org.apache.hadoop.hive.ql.plan.MapJoinDesc;
+import org.apache.hadoop.hive.ql.plan.OperatorDesc;
+import org.apache.hadoop.hive.ql.parse.TableAccessAnalyzer;
/**
* this transformation does bucket map join optimization.
*/
abstract public class AbstractBucketJoinProc implements NodeProcessor {
+ private static final Log LOG =
+ LogFactory.getLog(AbstractBucketJoinProc.class.getName());
- private static final Log LOG = LogFactory.getLog(AbstractBucketJoinProc.class.getName());
+ protected ParseContext pGraphContext;
+ public AbstractBucketJoinProc(ParseContext pGraphContext) {
+ this.pGraphContext = pGraphContext;
+ }
+
public AbstractBucketJoinProc() {
}
@Override
abstract public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
- Object... nodeOutputs) throws SemanticException;
+ Object... nodeOutputs) throws SemanticException;
- public List<String> toColumns(List<ExprNodeDesc> keys) {
+ private static List<String> getOnePartitionBucketFileNames(
+ URI location, ParseContext pGraphContext) throws SemanticException {
+ List<String> fileNames = new ArrayList<String>();
+ try {
+ FileSystem fs = FileSystem.get(location, pGraphContext.getConf());
+ FileStatus[] files = fs.listStatus(new Path(location.toString()));
+ if (files != null) {
+ for (FileStatus file : files) {
+ fileNames.add(file.getPath().toString());
+ }
+ }
+ } catch (IOException e) {
+ throw new SemanticException(e);
+ }
+ return fileNames;
+ }
+
+ private boolean checkBucketColumns(List<String> bucketColumns,
+ List<String> keys,
+ Integer[] orders) {
+ if (keys == null || bucketColumns == null || bucketColumns.isEmpty()) {
+ return false;
+ }
+ for (int i = 0; i < keys.size(); i++) {
+ int index = bucketColumns.indexOf(keys.get(i));
+ if (orders[i] != null && orders[i] != index) {
+ return false;
+ }
+ orders[i] = index;
+ }
+
+ // Check if the join columns contains all bucket columns.
+ // If a table is bucketized on column B, but the join key is A and B,
+ // it is easy to see joining on different buckets yield empty results.
+ return keys.containsAll(bucketColumns);
+ }
+
+ private boolean checkBucketNumberAgainstBigTable(
+ Map<String, List<Integer>> aliasToBucketNumber, int bucketNumberInPart) {
+ for (List<Integer> bucketNums : aliasToBucketNumber.values()) {
+ for (int nxt : bucketNums) {
+ boolean ok = (nxt >= bucketNumberInPart) ? nxt % bucketNumberInPart == 0
+ : bucketNumberInPart % nxt == 0;
+ if (!ok) {
+ return false;
+ }
+ }
+ }
+ return true;
+ }
+
+ protected boolean canConvertMapJoinToBucketMapJoin(
+ MapJoinOperator mapJoinOp,
+ ParseContext pGraphContext,
+ BucketJoinOptProcCtx context) throws SemanticException {
+
+ if (context.getSetOfRejectedMapjoins().contains(mapJoinOp)) {
+ return false;
+ }
+
+ QBJoinTree joinCtx = this.pGraphContext.getMapJoinContext().get(mapJoinOp);
+ if (joinCtx == null) {
+ return false;
+ }
+
+ List<String> joinAliases = new ArrayList<String>();
+ String[] srcs = joinCtx.getBaseSrc();
+ String[] left = joinCtx.getLeftAliases();
+ List<String> mapAlias = joinCtx.getMapAliases();
+ String baseBigAlias = null;
+
+ for (String s : left) {
+ if (s != null) {
+ String subQueryAlias = QB.getAppendedAliasFromId(joinCtx.getId(), s);
+ if (!joinAliases.contains(subQueryAlias)) {
+ joinAliases.add(subQueryAlias);
+ if (!mapAlias.contains(s)) {
+ baseBigAlias = subQueryAlias;
+ }
+ }
+ }
+ }
+
+ for (String s : srcs) {
+ if (s != null) {
+ String subQueryAlias = QB.getAppendedAliasFromId(joinCtx.getId(), s);
+ if (!joinAliases.contains(subQueryAlias)) {
+ joinAliases.add(subQueryAlias);
+ if (!mapAlias.contains(s)) {
+ baseBigAlias = subQueryAlias;
+ }
+ }
+ }
+ }
+
+ Map<Byte, List<ExprNodeDesc>> keysMap = mapJoinOp.getConf().getKeys();
+
+ return checkConvertBucketMapJoin(
+ pGraphContext,
+ context,
+ joinCtx,
+ keysMap,
+ baseBigAlias,
+ joinAliases);
+ }
+
+ protected boolean checkConvertBucketMapJoin(
+ ParseContext pGraphContext,
+ BucketJoinOptProcCtx context,
+ QBJoinTree joinCtx,
+ Map<Byte, List<ExprNodeDesc>> keysMap,
+ String baseBigAlias,
+ List<String> joinAliases) throws SemanticException {
+
+ LinkedHashMap<String, List<Integer>> aliasToPartitionBucketNumberMapping =
+ new LinkedHashMap<String, List<Integer>>();
+ LinkedHashMap<String, List<List<String>>> aliasToPartitionBucketFileNamesMapping =
+ new LinkedHashMap<String, List<List<String>>>();
+
+ HashMap<String, Operator<? extends OperatorDesc>> topOps = pGraphContext.getTopOps();
+ Map<TableScanOperator, Table> topToTable = pGraphContext.getTopToTable();
+
+ // (partition to bucket file names) and (partition to bucket number) for
+ // the big table;
+ LinkedHashMap<Partition, List<String>> bigTblPartsToBucketFileNames =
+ new LinkedHashMap<Partition, List<String>>();
+ LinkedHashMap<Partition, Integer> bigTblPartsToBucketNumber =
+ new LinkedHashMap<Partition, Integer>();
+
+ Integer[] orders = null; // accessing order of join cols to bucket cols, should be same
+ boolean bigTablePartitioned = true;
+ for (int index = 0; index < joinAliases.size(); index++) {
+ String alias = joinAliases.get(index);
+ Operator<? extends OperatorDesc> topOp = joinCtx.getAliasToOpInfo().get(alias);
+ if (topOp == null) {
+ return false;
+ }
+ List<String> keys = toColumns(keysMap.get((byte) index));
+ if (keys == null || keys.isEmpty()) {
+ return false;
+ }
+ int oldKeySize = keys.size();
+ TableScanOperator tso = TableAccessAnalyzer.genRootTableScan(topOp, keys);
+ if (tso == null) {
+ return false;
+ }
+
+ // For nested sub-queries, the alias mapping is not maintained in QB currently.
+ if (topOps.containsValue(tso)) {
+ for (Map.Entry<String, Operator<? extends OperatorDesc>> topOpEntry : topOps.entrySet()) {
+ if (topOpEntry.getValue() == tso) {
+ String newAlias = topOpEntry.getKey();
+ joinAliases.set(index, newAlias);
+ if (baseBigAlias.equals(alias)) {
+ baseBigAlias = newAlias;
+ }
+ alias = newAlias;
+ break;
+ }
+ }
+ }
+ else {
+ // Ideally, this should never happen, and this should be an assert.
+ return false;
+ }
+
+ // The join keys cannot be transformed in the sub-query currently.
+ // TableAccessAnalyzer.genRootTableScan will only return the base table scan
+ // if the join keys are constants or a column. Even a simple cast of the join keys
+ // will result in a null table scan operator. In case of constant join keys, they would
+ // be removed, and the size before and after the genRootTableScan will be different.
+ if (keys.size() != oldKeySize) {
+ return false;
+ }
+
+ if (orders == null) {
+ orders = new Integer[keys.size()];
+ }
+
+ Table tbl = topToTable.get(tso);
+ if (tbl.isPartitioned()) {
+ PrunedPartitionList prunedParts;
+ try {
+ prunedParts = pGraphContext.getOpToPartList().get(tso);
+ if (prunedParts == null) {
+ prunedParts =
+ PartitionPruner.prune(tbl, pGraphContext.getOpToPartPruner().get(tso),
+ pGraphContext.getConf(), alias,
+ pGraphContext.getPrunedPartitions());
+ pGraphContext.getOpToPartList().put(tso, prunedParts);
+ }
+ } catch (HiveException e) {
+ // Has to use full name to make sure it does not conflict with
+ // org.apache.commons.lang.StringUtils
+ LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+ throw new SemanticException(e.getMessage(), e);
+ }
+ List<Partition> partitions = prunedParts.getNotDeniedPartns();
+ // construct a mapping of (Partition->bucket file names) and (Partition -> bucket number)
+ if (partitions.isEmpty()) {
+ if (!alias.equals(baseBigAlias)) {
+ aliasToPartitionBucketNumberMapping.put(alias, Arrays.<Integer>asList());
+ aliasToPartitionBucketFileNamesMapping.put(alias, new ArrayList<List<String>>());
+ }
+ } else {
+ List<Integer> buckets = new ArrayList<Integer>();
+ List<List<String>> files = new ArrayList<List<String>>();
+ for (Partition p : partitions) {
+ if (!checkBucketColumns(p.getBucketCols(), keys, orders)) {
+ return false;
+ }
+ List<String> fileNames =
+ getOnePartitionBucketFileNames(p.getDataLocation(), pGraphContext);
+ // The number of files for the table should be same as number of buckets.
+ int bucketCount = p.getBucketCount();
+
+ if (fileNames.size() != bucketCount) {
+ String msg = "The number of buckets for table " +
+ tbl.getTableName() + " partition " + p.getName() + " is " +
+ p.getBucketCount() + ", whereas the number of files is " + fileNames.size();
+ throw new SemanticException(
+ ErrorMsg.BUCKETED_TABLE_METADATA_INCORRECT.getMsg(msg));
+ }
+
+ if (alias.equals(baseBigAlias)) {
+ bigTblPartsToBucketFileNames.put(p, fileNames);
+ bigTblPartsToBucketNumber.put(p, bucketCount);
+ } else {
+ files.add(fileNames);
+ buckets.add(bucketCount);
+ }
+ }
+ if (!alias.equals(baseBigAlias)) {
+ aliasToPartitionBucketNumberMapping.put(alias, buckets);
+ aliasToPartitionBucketFileNamesMapping.put(alias, files);
+ }
+ }
+ } else {
+ if (!checkBucketColumns(tbl.getBucketCols(), keys, orders)) {
+ return false;
+ }
+ List<String> fileNames =
+ getOnePartitionBucketFileNames(tbl.getDataLocation(), pGraphContext);
+ Integer num = new Integer(tbl.getNumBuckets());
+
+ // The number of files for the table should be same as number of buckets.
+ if (fileNames.size() != num) {
+ String msg = "The number of buckets for table " +
+ tbl.getTableName() + " is " + tbl.getNumBuckets() +
+ ", whereas the number of files is " + fileNames.size();
+ throw new SemanticException(
+ ErrorMsg.BUCKETED_TABLE_METADATA_INCORRECT.getMsg(msg));
+ }
+
+ if (alias.equals(baseBigAlias)) {
+ bigTblPartsToBucketFileNames.put(null, fileNames);
+ bigTblPartsToBucketNumber.put(null, tbl.getNumBuckets());
+ bigTablePartitioned = false;
+ } else {
+ aliasToPartitionBucketNumberMapping.put(alias, Arrays.asList(num));
+ aliasToPartitionBucketFileNamesMapping.put(alias, Arrays.asList(fileNames));
+ }
+ }
+ }
+
+ // All tables or partitions are bucketed, and their bucket numbers are stored
+ // in the mappings above. Check that each small table's bucket count divides,
+ // or is divisible by, the number of buckets in each big-table partition.
+ for (Integer bucketNumber : bigTblPartsToBucketNumber.values()) {
+ if (!checkBucketNumberAgainstBigTable(aliasToPartitionBucketNumberMapping, bucketNumber)) {
+ return false;
+ }
+ }
+
+ context.setAliasToPartitionBucketNumberMapping(aliasToPartitionBucketNumberMapping);
+ context.setAliasToPartitionBucketFileNamesMapping(aliasToPartitionBucketFileNamesMapping);
+ context.setBigTblPartsToBucketFileNames(bigTblPartsToBucketFileNames);
+ context.setBigTblPartsToBucketNumber(bigTblPartsToBucketNumber);
+ context.setJoinAliases(joinAliases);
+ context.setBaseBigAlias(baseBigAlias);
+ context.setBigTablePartitioned(bigTablePartitioned);
+
+ return true;
+ }
+
+ protected void convertMapJoinToBucketMapJoin(
+ MapJoinOperator mapJoinOp,
+ BucketJoinOptProcCtx context) throws SemanticException {
+ MapJoinDesc desc = mapJoinOp.getConf();
+
+ Map<String, Map<String, List<String>>> aliasBucketFileNameMapping =
+ new LinkedHashMap<String, Map<String, List<String>>>();
+
+ Map<String, List<Integer>> aliasToPartitionBucketNumberMapping =
+ context.getAliasToPartitionBucketNumberMapping();
+
+ Map<String, List<List<String>>> aliasToPartitionBucketFileNamesMapping =
+ context.getAliasToPartitionBucketFileNamesMapping();
+
+ Map<Partition, List<String>> bigTblPartsToBucketFileNames =
+ context.getBigTblPartsToBucketFileNames();
+
+ Map<Partition, Integer> bigTblPartsToBucketNumber =
+ context.getBigTblPartsToBucketNumber();
+
+ List<String> joinAliases = context.getJoinAliases();
+ String baseBigAlias = context.getBaseBigAlias();
+
+ // sort bucket names for the big table
+ for (List<String> partBucketNames : bigTblPartsToBucketFileNames.values()) {
+ Collections.sort(partBucketNames);
+ }
+
+ // go through all small tables and get the mapping from bucket file name
+ // in the big table to bucket file names in small tables.
+ for (int j = 0; j < joinAliases.size(); j++) {
+ String alias = joinAliases.get(j);
+ if (alias.equals(baseBigAlias)) {
+ continue;
+ }
+ for (List<String> names : aliasToPartitionBucketFileNamesMapping.get(alias)) {
+ Collections.sort(names);
+ }
+ List<Integer> smallTblBucketNums = aliasToPartitionBucketNumberMapping.get(alias);
+ List<List<String>> smallTblFilesList = aliasToPartitionBucketFileNamesMapping.get(alias);
+
+ Map<String, List<String>> mapping = new LinkedHashMap<String, List<String>>();
+ aliasBucketFileNameMapping.put(alias, mapping);
+
+ // for each bucket file in big table, get the corresponding bucket file
+ // name in the small table.
+ // more than 1 partition in the big table, do the mapping for each partition
+ Iterator<Entry<Partition, List<String>>> bigTblPartToBucketNames =
+ bigTblPartsToBucketFileNames.entrySet().iterator();
+ Iterator<Entry<Partition, Integer>> bigTblPartToBucketNum = bigTblPartsToBucketNumber
+ .entrySet().iterator();
+ while (bigTblPartToBucketNames.hasNext()) {
+ assert bigTblPartToBucketNum.hasNext();
+ int bigTblBucketNum = bigTblPartToBucketNum.next().getValue();
+ List<String> bigTblBucketNameList = bigTblPartToBucketNames.next().getValue();
+ fillMapping(smallTblBucketNums, smallTblFilesList,
+ mapping, bigTblBucketNum, bigTblBucketNameList, desc.getBigTableBucketNumMapping());
+ }
+ }
+ desc.setAliasBucketFileNameMapping(aliasBucketFileNameMapping);
+ desc.setBigTableAlias(baseBigAlias);
+ boolean bigTablePartitioned = context.isBigTablePartitioned();
+ if (bigTablePartitioned) {
+ desc.setBigTablePartSpecToFileMapping(convert(bigTblPartsToBucketFileNames));
+ }
+ }
+
+ // convert partition to partition spec string
+ private static Map<String, List<String>> convert(Map<Partition, List<String>> mapping) {
+ Map<String, List<String>> converted = new HashMap<String, List<String>>();
+ for (Map.Entry<Partition, List<String>> entry : mapping.entrySet()) {
+ converted.put(entry.getKey().getName(), entry.getValue());
+ }
+ return converted;
+ }
+
+ private List<String> toColumns(List<ExprNodeDesc> keys) {
List<String> columns = new ArrayList<String>();
for (ExprNodeDesc key : keys) {
if (!(key instanceof ExprNodeColumnDesc)) {
@@ -54,4 +448,37 @@
}
return columns;
}
+
+ // called for each partition of big table and populates mapping for each file in the partition
+ private static void fillMapping(
+ List<Integer> smallTblBucketNums,
+ List<List<String>> smallTblFilesList,
+ Map<String, List<String>> mapping,
+ int bigTblBucketNum, List<String> bigTblBucketNameList,
+ Map<String, Integer> bucketFileNameMapping) {
+
+ for (int bindex = 0; bindex < bigTblBucketNameList.size(); bindex++) {
+ ArrayList<String> resultFileNames = new ArrayList<String>();
+ for (int sindex = 0; sindex < smallTblBucketNums.size(); sindex++) {
+ int smallTblBucketNum = smallTblBucketNums.get(sindex);
+ List<String> smallTblFileNames = smallTblFilesList.get(sindex);
+ if (bigTblBucketNum >= smallTblBucketNum) {
+ // if the big table has more buckets than the current small table,
+ // use "MOD" to get small table bucket names. For example, if the big
+ // table has 4 buckets and the small table has 2 buckets, then the
+ // mapping should be 0->0, 1->1, 2->0, 3->1.
+ int toAddSmallIndex = bindex % smallTblBucketNum;
+ resultFileNames.add(smallTblFileNames.get(toAddSmallIndex));
+ } else {
+ int jump = smallTblBucketNum / bigTblBucketNum;
+ for (int i = bindex; i < smallTblFileNames.size(); i = i + jump) {
+ resultFileNames.add(smallTblFileNames.get(i));
+ }
+ }
+ }
+ String inputBigTBLBucket = bigTblBucketNameList.get(bindex);
+ mapping.put(inputBigTBLBucket, resultFileNames);
+ bucketFileNameMapping.put(inputBigTBLBucket, bindex);
+ }
+ }
}
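
Two numeric rules above govern the conversion: checkBucketNumberAgainstBigTable requires each small-table bucket count to divide, or be divisible by, the big table's bucket count, and fillMapping then pairs bucket files either by MOD (big table has at least as many buckets) or by striding (small table has more). A standalone sketch with made-up bucket counts, illustrative only:

    import java.util.ArrayList;
    import java.util.List;

    public class BucketMappingDemo {
      // Mirrors checkBucketNumberAgainstBigTable() for a single small table.
      static boolean compatible(int smallBuckets, int bigBuckets) {
        return (smallBuckets >= bigBuckets) ? smallBuckets % bigBuckets == 0
            : bigBuckets % smallBuckets == 0;
      }

      // Small-table bucket indexes matched to big-table bucket 'bindex',
      // mirroring the two branches of fillMapping().
      static List<Integer> matchedSmallBuckets(int bigBuckets, int smallBuckets, int bindex) {
        List<Integer> result = new ArrayList<Integer>();
        if (bigBuckets >= smallBuckets) {
          result.add(bindex % smallBuckets);    // wrap around with MOD
        } else {
          int jump = smallBuckets / bigBuckets; // stride through the extra buckets
          for (int i = bindex; i < smallBuckets; i += jump) {
            result.add(i);
          }
        }
        return result;
      }

      public static void main(String[] args) {
        System.out.println(compatible(2, 4)); // true: 4 % 2 == 0
        System.out.println(compatible(3, 4)); // false: conversion would be rejected
        for (int b = 0; b < 4; b++) {
          // 4 big buckets vs 2 small buckets: 0->0, 1->1, 2->0, 3->1
          System.out.println(b + " -> " + matchedSmallBuckets(4, 2, b));
        }
      }
    }
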
Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/SizeSortMergeJoinBigTableMatcher.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/SizeSortMergeJoinBigTableMatcher.java (revision 0)
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/SizeSortMergeJoinBigTableMatcher.java (working copy)
@@ -0,0 +1,81 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.optimizer;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.ql.exec.JoinOperator;
+import org.apache.hadoop.hive.ql.exec.TableScanOperator;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.metadata.Partition;
+import org.apache.hadoop.hive.ql.metadata.Table;
+import org.apache.hadoop.hive.ql.optimizer.ppr.PartitionPruner;
+import org.apache.hadoop.hive.ql.parse.ParseContext;
+import org.apache.hadoop.hive.ql.parse.PrunedPartitionList;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+
+/*
+ * This is a pluggable policy to choose the big table for converting a join to a
+ * sort-merge join. The table with the largest total data size is chosen.
+ */
+public class SizeSortMergeJoinBigTableMatcher extends CommonSizeSortMergeJoinBigTableMatcher
+implements SortMergeJoinBigTableMatcher {
+ public int getBigTablePosition(ParseContext parseCtx, JoinOperator joinOp)
+ throws SemanticException {
+ int bigTablePos = 0;
+ long maxSize = 0;
+ HiveConf conf = parseCtx.getConf();
+
+ try {
+ List<TableScanOperator> topOps = new ArrayList<TableScanOperator>();
+ getListTopOps(joinOp, topOps);
+ int currentPos = 0;
+ for (TableScanOperator topOp : topOps) {
+ Table table = parseCtx.getTopToTable().get(topOp);
+ long currentSize = 0;
+
+ if (!table.isPartitioned()) {
+ currentSize = getSize(conf, table);
+ }
+ else {
+ // For partitioned tables, get the size of all the partitions
+ PrunedPartitionList partsList =
+ PartitionPruner.prune(parseCtx.getTopToTable().get(topOp),
+ parseCtx.getOpToPartPruner().get(topOp), parseCtx.getConf(),
+ null, parseCtx.getPrunedPartitions());
+ for (Partition part : partsList.getNotDeniedPartns()) {
+ currentSize += getSize(conf, part);
+ }
+ }
+
+ if (currentSize > maxSize) {
+ maxSize = currentSize;
+ bigTablePos = currentPos;
+ }
+ currentPos++;
+ }
+ } catch (HiveException e) {
+ throw new SemanticException(e.getMessage(), e);
+ }
+
+ return bigTablePos;
+ }
+}
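
Because every policy implements SortMergeJoinBigTableMatcher, the optimizer can load whichever matcher class is configured. How that lookup is wired up is not shown in this hunk; the sketch below is an assumption, compiling against the classes added by this patch, with error handling omitted:

    import org.apache.hadoop.hive.ql.optimizer.SortMergeJoinBigTableMatcher;

    public class MatcherSelectionDemo {
      public static void main(String[] args) throws Exception {
        // In practice the class name would come from HiveConf rather than a literal.
        String className =
            "org.apache.hadoop.hive.ql.optimizer.SizeSortMergeJoinBigTableMatcher";
        SortMergeJoinBigTableMatcher matcher =
            (SortMergeJoinBigTableMatcher) Class.forName(className).newInstance();
        System.out.println("Using policy: " + matcher.getClass().getSimpleName());
      }
    }
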
Index: ql/src/java/org/apache/hadoop/hive/ql/exec/Task.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/exec/Task.java (revision 1432851)
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/Task.java (working copy)
@@ -76,6 +76,7 @@
public static final int CONVERTED_LOCAL_MAPJOIN = 3;
public static final int BACKUP_COMMON_JOIN = 4;
public static final int LOCAL_MAPJOIN=5;
+ public static final int CONVERTED_SORTMERGEJOIN = 6;
// Descendants tasks who subscribe feeds from this task
Index: ql/src/java/org/apache/hadoop/hive/ql/exec/FilterOperator.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/exec/FilterOperator.java (revision 1432851)
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/FilterOperator.java (working copy)
@@ -172,6 +172,11 @@
}
@Override
+ public boolean supportAutomaticSortMergeJoin() {
+ return true;
+ }
+
+ @Override
public boolean supportUnionRemoveOptimization() {
return true;
}
Index: ql/src/java/org/apache/hadoop/hive/ql/exec/SMBMapJoinOperator.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/exec/SMBMapJoinOperator.java (revision 1432851)
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/SMBMapJoinOperator.java (working copy)
@@ -76,6 +76,11 @@
private transient boolean inputFileChanged = false;
transient boolean localWorkInited = false;
+ // This join has been converted to an SMB join by the Hive optimizer. The user did not
+ // give a mapjoin hint in the query. The optimizer determined that the join can be
+ // performed as an SMB join, based on all the tables/partitions being joined.
+ private transient boolean convertedAutomaticallySMBJoin = false;
+
public SMBMapJoinOperator() {
}
@@ -622,6 +627,14 @@
return OperatorType.MAPJOIN;
}
+ public boolean isConvertedAutomaticallySMBJoin() {
+ return convertedAutomaticallySMBJoin;
+ }
+
+ public void setConvertedAutomaticallySMBJoin(boolean convertedAutomaticallySMBJoin) {
+ this.convertedAutomaticallySMBJoin = convertedAutomaticallySMBJoin;
+ }
+
// returns rows from possibly multiple bucket files of small table in ascending order
// by utilizing primary queue (borrowed from hadoop)
// elements of queue (Integer) are index to FetchOperator[] (segments)
Index: ql/src/java/org/apache/hadoop/hive/ql/exec/SelectOperator.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/exec/SelectOperator.java (revision 1432851)
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/SelectOperator.java (working copy)
@@ -112,6 +112,11 @@
}
@Override
+ public boolean supportAutomaticSortMergeJoin() {
+ return true;
+ }
+
+ @Override
public boolean supportUnionRemoveOptimization() {
return true;
}
Index: ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java (revision 1432851)
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java (working copy)
@@ -295,4 +295,9 @@
public boolean supportSkewJoinOptimization() {
return true;
}
+
+ @Override
+ public boolean supportAutomaticSortMergeJoin() {
+ return true;
+ }
}
Index: ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java (revision 1432851)
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java (working copy)
@@ -1426,6 +1426,15 @@
this.useBucketizedHiveInputFormat = useBucketizedHiveInputFormat;
}
+ /**
+ * Whether this operator supports automatic conversion to a sort-merge join.
+ * The operator stack is traversed, and this method is invoked for each operator on it.
+ * @return true if the conversion is supported, false otherwise.
+ */
+ public boolean supportAutomaticSortMergeJoin() {
+ return false;
+ }
+
public boolean supportUnionRemoveOptimization() {
return false;
}
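
A hedged sketch of how a conversion rule might consult this hook while examining the operator stack handed to NodeProcessor.process(); the helper below is illustrative, not part of the patch. Any operator that keeps the default false (for example, a ReduceSinkOperator between the table scan and the join) blocks the automatic conversion:

    import java.util.Stack;

    import org.apache.hadoop.hive.ql.exec.Operator;
    import org.apache.hadoop.hive.ql.lib.Node;
    import org.apache.hadoop.hive.ql.plan.OperatorDesc;

    public class AutoSMBStackCheck {
      @SuppressWarnings("unchecked")
      static boolean allOperatorsSupportAutoSMBJoin(Stack<Node> stack) {
        for (Node node : stack) {
          Operator<? extends OperatorDesc> op = (Operator<? extends OperatorDesc>) node;
          if (!op.supportAutomaticSortMergeJoin()) {
            return false; // an unsupported operator sits between the scan and the join
          }
        }
        return true;
      }
    }
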
Index: ql/src/java/org/apache/hadoop/hive/ql/plan/SMBJoinDesc.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/plan/SMBJoinDesc.java (revision 1432851)
+++ ql/src/java/org/apache/hadoop/hive/ql/plan/SMBJoinDesc.java (working copy)
@@ -31,7 +31,7 @@
private MapredLocalWork localWork;
- //keep a mapping from tag to the fetch operator alias
+ // keep a mapping from tag to the fetch operator alias
private HashMap<Byte, String> tagToAlias;
private Map<String, DummyStoreOperator> aliasToSink;
Index: ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java (revision 1432851)
+++ ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java (working copy)
@@ -71,6 +71,7 @@
import org.apache.hadoop.hive.ql.exec.RecordWriter;
import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
import org.apache.hadoop.hive.ql.exec.RowSchema;
+import org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator;
import org.apache.hadoop.hive.ql.exec.SelectOperator;
import org.apache.hadoop.hive.ql.exec.StatsTask;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
@@ -195,6 +196,7 @@
private List<LoadTableDesc> loadTableWork;
private List<LoadFileDesc> loadFileWork;
private Map<JoinOperator, QBJoinTree> joinContext;
+ private Map<SMBMapJoinOperator, QBJoinTree> smbMapJoinContext;
private final HashMap<TableScanOperator, Table> topToTable;
private QB qb;
private ASTNode ast;
@@ -249,6 +251,7 @@
loadFileWork = new ArrayList<LoadFileDesc>();
opParseCtx = new LinkedHashMap<Operator<? extends OperatorDesc>, OpParseContext>();
joinContext = new HashMap<JoinOperator, QBJoinTree>();
+ smbMapJoinContext = new HashMap<SMBMapJoinOperator, QBJoinTree>();
topToTable = new HashMap<TableScanOperator, Table>();
destTableId = 1;
uCtx = null;
@@ -277,6 +280,7 @@
ast = null;
uCtx = null;
joinContext.clear();
+ smbMapJoinContext.clear();
opParseCtx.clear();
groupOpToInputTables.clear();
prunedPartitions.clear();
@@ -292,6 +296,7 @@
loadTableWork = pctx.getLoadTableWork();
loadFileWork = pctx.getLoadFileWork();
joinContext = pctx.getJoinContext();
+ smbMapJoinContext = pctx.getSmbMapJoinContext();
ctx = pctx.getContext();
destTableId = pctx.getDestTableId();
idToTableNameMap = pctx.getIdToTableNameMap();
@@ -306,7 +311,7 @@
public ParseContext getParseContext() {
return new ParseContext(conf, qb, ast, opToPartPruner, opToPartList, topOps,
- topSelOps, opParseCtx, joinContext, topToTable, loadTableWork,
+ topSelOps, opParseCtx, joinContext, smbMapJoinContext, topToTable, loadTableWork,
loadFileWork, ctx, idToTableNameMap, destTableId, uCtx,
listMapJoinOpsNoReducer, groupOpToInputTables, prunedPartitions,
opToSamplePruner, globalLimitCtx, nameToSplitSample, inputs, rootTasks,
@@ -8475,7 +8480,8 @@
}
ParseContext pCtx = new ParseContext(conf, qb, child, opToPartPruner,
- opToPartList, topOps, topSelOps, opParseCtx, joinContext, topToTable,
+ opToPartList, topOps, topSelOps, opParseCtx, joinContext, smbMapJoinContext,
+ topToTable,
loadTableWork, loadFileWork, ctx, idToTableNameMap, destTableId, uCtx,
listMapJoinOpsNoReducer, groupOpToInputTables, prunedPartitions,
opToSamplePruner, globalLimitCtx, nameToSplitSample, inputs, rootTasks,
Index: ql/src/java/org/apache/hadoop/hive/ql/parse/ParseContext.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/parse/ParseContext.java (revision 1432851)
+++ ql/src/java/org/apache/hadoop/hive/ql/parse/ParseContext.java (working copy)
@@ -34,6 +34,7 @@
import org.apache.hadoop.hive.ql.exec.JoinOperator;
import org.apache.hadoop.hive.ql.exec.MapJoinOperator;
import org.apache.hadoop.hive.ql.exec.Operator;
+import org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.exec.Task;
import org.apache.hadoop.hive.ql.hooks.LineageInfo;
@@ -71,6 +72,7 @@
private LinkedHashMap<Operator<? extends OperatorDesc>, OpParseContext> opParseCtx;
private Map<JoinOperator, QBJoinTree> joinContext;
private Map<MapJoinOperator, QBJoinTree> mapJoinContext;
+ private Map<SMBMapJoinOperator, QBJoinTree> smbMapJoinContext;
private HashMap<TableScanOperator, Table> topToTable;
private HashMap<String, SplitSample> nameToSplitSample;
private List<LoadTableDesc> loadTableWork;
@@ -160,6 +162,7 @@
HashMap<String, Operator<? extends OperatorDesc>> topSelOps,
LinkedHashMap