Index: conf/hive-default.xml.template
===================================================================
--- conf/hive-default.xml.template	(revision 1431968)
+++ conf/hive-default.xml.template	(working copy)
@@ -916,6 +916,24 @@
+<property>
+  <name>hive.auto.sort.merge.join</name>
+  <value>false</value>
+  <description>Will the join be automatically converted to a sort-merge join if the joined tables pass
+    the criteria for a sort-merge join.
+  </description>
+</property>
+
+<property>
+  <name>hive.auto.sort.merge.join.bigtbl.matcher</name>
+  <value>org.apache.hadoop.hive.ql.optimizer.LeftSortMergeJoinBigTableMatcher</value>
+  <description>The policy to choose the big table for automatic conversion to sort-merge join.
+    By default, the leftmost table is chosen as the big table. Other policies are size-based:
+    org.apache.hadoop.hive.ql.optimizer.SizeSortMergeJoinBigTableMatcher.
+    New policies can be added in the future.
+  </description>
+</property>
+
 <property>
   <name>hive.metastore.ds.connection.url.hook</name>
   <description>Name of the hook to use for retriving the JDO connection URL. If empty, the value in javax.jdo.option.ConnectionURL is used</description>
Index: build.properties
===================================================================
--- build.properties	(revision 1431968)
+++ build.properties	(working copy)
@@ -79,7 +79,7 @@
 # (measured in milliseconds). Ignored if fork is disabled. When running
 # multiple tests inside the same Java VM (see forkMode), timeout
 # applies to the time that all tests use together, not to an individual test.
-test.junit.timeout=43200000
+test.junit.timeout=432000000
 
 # Use this property to selectively disable tests from the command line:
 # ant test -Dtest.junit.exclude="**/TestCliDriver.class"
Index: data/files/smallsrcsortbucket4outof4.txt
===================================================================
--- data/files/smallsrcsortbucket4outof4.txt	(revision 0)
+++ data/files/smallsrcsortbucket4outof4.txt	(working copy)
@@ -0,0 +1,5 @@
+146val_146
+193val_193
+432val_432
+65val_65
+83val_83
Index: data/files/smallsrcsortbucket1outof4.txt
===================================================================
--- data/files/smallsrcsortbucket1outof4.txt	(revision 0)
+++ data/files/smallsrcsortbucket1outof4.txt	(working copy)
@@ -0,0 +1,5 @@
+0val_0
+103val_103
+169val_169
+172val_172
+374val_374
Index: data/files/smallsrcsortbucket2outof4.txt
===================================================================
--- data/files/smallsrcsortbucket2outof4.txt	(revision 0)
+++ data/files/smallsrcsortbucket2outof4.txt	(working copy)
@@ -0,0 +1,5 @@
+180val_180
+221val_221
+379val_379
+478val_478
+74val_74
Index: data/files/smallsrcsortbucket3outof4.txt
===================================================================
--- data/files/smallsrcsortbucket3outof4.txt	(revision 0)
+++ data/files/smallsrcsortbucket3outof4.txt	(working copy)
@@ -0,0 +1,5 @@
+233val_233
+424val_424
+468val_468
+53val_53
+97val_97
Index: common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
===================================================================
--- common/src/java/org/apache/hadoop/hive/conf/HiveConf.java	(revision 1431968)
+++ common/src/java/org/apache/hadoop/hive/conf/HiveConf.java	(working copy)
@@ -498,6 +498,10 @@
     HIVEENFORCESORTMERGEBUCKETMAPJOIN("hive.enforce.sortmergebucketmapjoin", false),
     HIVEENFORCEBUCKETMAPJOIN("hive.enforce.bucketmapjoin", false),
+    HIVE_AUTO_SORT_MERGE_JOIN("hive.auto.sort.merge.join", false),
+    HIVE_AUTO_SORT_MERGE_JOIN_BIGTABLE_MATCHER("hive.auto.sort.merge.join.bigtbl.matcher",
+        "org.apache.hadoop.hive.ql.optimizer.LeftSortMergeJoinBigTableMatcher"),
+
     HIVESCRIPTOPERATORTRUST("hive.exec.script.trust", false),
HIVEROWOFFSET("hive.exec.rowoffset", false), Index: ql/src/test/results/clientpositive/auto_sortmerge_join_6.q.out =================================================================== --- ql/src/test/results/clientpositive/auto_sortmerge_join_6.q.out (revision 0) +++ ql/src/test/results/clientpositive/auto_sortmerge_join_6.q.out (working copy) @@ -0,0 +1,761 @@ +PREHOOK: query: -- small no part, 4 bucket & big 2 part, 2 bucket +CREATE TABLE bucket_small (key string, value string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- small no part, 4 bucket & big 2 part, 2 bucket +CREATE TABLE bucket_small (key string, value string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@bucket_small +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small +PREHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@bucket_big +PREHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big +POSTHOOK: Output: default@bucket_big@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big@ds=2008-04-08 +PREHOOK: query: load data local inpath 
'../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big +POSTHOOK: Output: default@bucket_big@ds=2008-04-09 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big@ds=2008-04-09 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big@ds=2008-04-09 +PREHOOK: query: -- Since size is being used to find the big table, the order of the tables in the join does not matter +explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: -- Since size is being used to find the big table, the order of the tables in the join does not matter +explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME bucket_small) a) (TOK_TABREF (TOK_TABNAME bucket_big) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count))))) + +STAGE DEPENDENCIES: + Stage-6 is a root stage , consists of Stage-7, Stage-8, Stage-1 + Stage-7 has a backup stage: Stage-1 + Stage-4 depends on stages: Stage-7 + Stage-2 depends on stages: Stage-1, Stage-4, Stage-5 + Stage-8 has a backup stage: Stage-1 + Stage-5 depends on stages: Stage-8 + Stage-1 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-6 + Conditional Operator + + Stage: Stage-7 + Map Reduce Local Work + Alias -> Map Local Tables: + b + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + b + TableScan + alias: b + GatherStats: false + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + GatherStats: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Local Work: + Map Reduce Local Work + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name 
default.bucket_big + numFiles 2 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5500 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-09 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-09 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5500 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 0 + numRows 0 + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 226 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX 
TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 0 + numRows 0 + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 226 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small + Truncated Path -> Alias: + /bucket_small [a] + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: -mr-10002 + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0 + columns.types bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Truncated Path -> Alias: +#### A masked pattern was here #### + + Stage: Stage-8 + Map Reduce Local Work + Alias -> Map Local Tables: + a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + a + TableScan + alias: a + GatherStats: false + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + + Stage: Stage-5 + Map Reduce + Alias -> Map Operator Tree: + b + TableScan + alias: b + GatherStats: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Local Work: + Map Reduce Local Work + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: 
org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5500 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-09 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-09 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5500 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 0 + numRows 0 + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 226 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 0 + numRows 0 + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 226 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small + Truncated Path -> Alias: + /bucket_big/ds=2008-04-08 [b] + /bucket_big/ds=2008-04-09 [b] + + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + b + TableScan + alias: b + GatherStats: false + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5500 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-09 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + 
partition values: + ds 2008-04-09 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5500 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big + Truncated Path -> Alias: + /bucket_big/ds=2008-04-08 [b] + /bucket_big/ds=2008-04-09 [b] + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@bucket_big@ds=2008-04-08 +PREHOOK: Input: default@bucket_big@ds=2008-04-09 +PREHOOK: Input: default@bucket_small +#### A masked pattern was here #### +POSTHOOK: query: select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@bucket_big@ds=2008-04-08 +POSTHOOK: Input: default@bucket_big@ds=2008-04-09 +POSTHOOK: Input: default@bucket_small +#### A masked pattern was here #### +38 +PREHOOK: query: -- The hash table can only hold a very small amount (based on the parameter hive.mapjoin.localtask.max.memory.usage) +-- so, the attempt to convert to map-join should fail. + +select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@bucket_big@ds=2008-04-08 +PREHOOK: Input: default@bucket_big@ds=2008-04-09 +PREHOOK: Input: default@bucket_small +#### A masked pattern was here #### +Execution failed with exit status: 3 +Obtaining error information + +Task failed! +Task ID: + Stage-8 + +Logs: + +#### A masked pattern was here #### +FAILED: Execution Error, return code 3 from org.apache.hadoop.hive.ql.exec.MapredLocalTask +ATTEMPT: Execute BackupTask: org.apache.hadoop.hive.ql.exec.MapRedTask +POSTHOOK: query: -- The hash table can only hold a very small amount (based on the parameter hive.mapjoin.localtask.max.memory.usage) +-- so, the attempt to convert to map-join should fail. 
+ +select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@bucket_big@ds=2008-04-08 +POSTHOOK: Input: default@bucket_big@ds=2008-04-09 +POSTHOOK: Input: default@bucket_small +#### A masked pattern was here #### +38 Index: ql/src/test/results/clientpositive/auto_sortmerge_join_1.q.out =================================================================== --- ql/src/test/results/clientpositive/auto_sortmerge_join_1.q.out (revision 0) +++ ql/src/test/results/clientpositive/auto_sortmerge_join_1.q.out (working copy) @@ -0,0 +1,784 @@ +PREHOOK: query: -- small 1 part, 2 bucket & big 2 part, 4 bucket + +CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- small 1 part, 2 bucket & big 2 part, 4 bucket + +CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@bucket_small +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small +POSTHOOK: Output: default@bucket_small@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small@ds=2008-04-08 +PREHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@bucket_big +PREHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big +POSTHOOK: Output: default@bucket_big@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket3outof4.txt' 
INTO TABLE bucket_big partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big +POSTHOOK: Output: default@bucket_big@ds=2008-04-09 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big@ds=2008-04-09 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big@ds=2008-04-09 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big@ds=2008-04-09 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big@ds=2008-04-09 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big@ds=2008-04-09 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big@ds=2008-04-09 +PREHOOK: query: -- Since size is being used to find the big table, the order of the tables in the join does not matter +explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: -- Since size is being used to find the big table, the order of the tables in the join does not matter +explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME bucket_small) a) (TOK_TABREF (TOK_TABNAME bucket_big) b) (= (. (TOK_TABLE_OR_COL a) key) (. 
(TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count))))) + +STAGE DEPENDENCIES: + Stage-6 is a root stage , consists of Stage-7, Stage-8, Stage-1 + Stage-7 has a backup stage: Stage-1 + Stage-4 depends on stages: Stage-7 + Stage-2 depends on stages: Stage-1, Stage-4, Stage-5 + Stage-8 has a backup stage: Stage-1 + Stage-5 depends on stages: Stage-8 + Stage-1 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-6 + Conditional Operator + + Stage: Stage-7 + Map Reduce Local Work + Alias -> Map Local Tables: + b + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + b + TableScan + alias: b + GatherStats: false + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + GatherStats: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Local Work: + Map Reduce Local Work + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 11624 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-09 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-09 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + 
bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 11624 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 2 + numPartitions 1 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 114 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 2 + numPartitions 1 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 114 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small + Truncated Path -> Alias: + /bucket_small/ds=2008-04-08 [a] + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: -mr-10002 + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + + input format: 
org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0 + columns.types bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Truncated Path -> Alias: +#### A masked pattern was here #### + + Stage: Stage-8 + Map Reduce Local Work + Alias -> Map Local Tables: + a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + a + TableScan + alias: a + GatherStats: false + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + + Stage: Stage-5 + Map Reduce + Alias -> Map Operator Tree: + b + TableScan + alias: b + GatherStats: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Local Work: + Map Reduce Local Work + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 11624 +#### A masked pattern was here #### + 
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-09 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-09 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 11624 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 2 + numPartitions 1 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 114 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 2 + numPartitions 1 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 114 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small + Truncated Path -> Alias: + /bucket_big/ds=2008-04-08 [b] + /bucket_big/ds=2008-04-09 [b] + + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + b + TableScan + alias: b + GatherStats: false + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 
[Column[key]] + Position of Big Table: 1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 11624 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-09 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-09 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 11624 +#### A masked pattern was here #### + serde: 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big + Truncated Path -> Alias: + /bucket_big/ds=2008-04-08 [b] + /bucket_big/ds=2008-04-09 [b] + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@bucket_big@ds=2008-04-08 +PREHOOK: Input: default@bucket_big@ds=2008-04-09 +PREHOOK: Input: default@bucket_small@ds=2008-04-08 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@bucket_big@ds=2008-04-08 +POSTHOOK: Input: default@bucket_big@ds=2008-04-09 +POSTHOOK: Input: default@bucket_small@ds=2008-04-08 +#### A masked pattern was here #### +38 +PREHOOK: query: -- The hash table can only hold a very small amount (based on the parameter hive.mapjoin.localtask.max.memory.usage) +-- so, the attempt to convert to map-join should fail. + +select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@bucket_big@ds=2008-04-08 +PREHOOK: Input: default@bucket_big@ds=2008-04-09 +PREHOOK: Input: default@bucket_small@ds=2008-04-08 +#### A masked pattern was here #### +Execution failed with exit status: 3 +Obtaining error information + +Task failed! +Task ID: + Stage-8 + +Logs: + +#### A masked pattern was here #### +FAILED: Execution Error, return code 3 from org.apache.hadoop.hive.ql.exec.MapredLocalTask +ATTEMPT: Execute BackupTask: org.apache.hadoop.hive.ql.exec.MapRedTask +POSTHOOK: query: -- The hash table can only hold a very small amount (based on the parameter hive.mapjoin.localtask.max.memory.usage) +-- so, the attempt to convert to map-join should fail. 
+ +select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@bucket_big@ds=2008-04-08 +POSTHOOK: Input: default@bucket_big@ds=2008-04-09 +POSTHOOK: Input: default@bucket_small@ds=2008-04-08 +#### A masked pattern was here #### +38 Index: ql/src/test/results/clientpositive/auto_sortmerge_join_5.q.out =================================================================== --- ql/src/test/results/clientpositive/auto_sortmerge_join_5.q.out (revision 0) +++ ql/src/test/results/clientpositive/auto_sortmerge_join_5.q.out (working copy) @@ -0,0 +1,579 @@ +PREHOOK: query: -- small no part, 4 bucket & big no part, 2 bucket +CREATE TABLE bucket_small (key string, value string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- small no part, 4 bucket & big no part, 2 bucket +CREATE TABLE bucket_small (key string, value string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@bucket_small +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small +PREHOOK: query: CREATE TABLE bucket_big (key string, value string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE bucket_big (key string, value string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@bucket_big +PREHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big +PREHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big +PREHOOK: query: -- Since size is being used to find the big table, the 
order of the tables in the join does not matter +explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: -- Since size is being used to find the big table, the order of the tables in the join does not matter +explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME bucket_small) a) (TOK_TABREF (TOK_TABNAME bucket_big) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count))))) + +STAGE DEPENDENCIES: + Stage-6 is a root stage , consists of Stage-7, Stage-8, Stage-1 + Stage-7 has a backup stage: Stage-1 + Stage-4 depends on stages: Stage-7 + Stage-2 depends on stages: Stage-1, Stage-4, Stage-5 + Stage-8 has a backup stage: Stage-1 + Stage-5 depends on stages: Stage-8 + Stage-1 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-6 + Conditional Operator + + Stage: Stage-7 + Map Reduce Local Work + Alias -> Map Local Tables: + b + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + b + TableScan + alias: b + GatherStats: false + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + GatherStats: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Local Work: + Map Reduce Local Work + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: bucket_big + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 0 + numRows 0 + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 0 + numRows 0 + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + 
totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 0 + numRows 0 + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 226 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 0 + numRows 0 + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 226 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small + Truncated Path -> Alias: + /bucket_small [a] + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: -mr-10002 + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0 + columns.types bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Truncated Path -> Alias: +#### A masked pattern was here #### + + Stage: Stage-8 + Map Reduce Local Work + Alias -> Map Local Tables: + a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + a + TableScan + alias: a + GatherStats: false + HashTable Sink Operator + condition 
expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + + Stage: Stage-5 + Map Reduce + Alias -> Map Operator Tree: + b + TableScan + alias: b + GatherStats: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Local Work: + Map Reduce Local Work + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: bucket_big + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 0 + numRows 0 + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 0 + numRows 0 + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 0 + numRows 0 + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 226 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 0 + numRows 0 + rawDataSize 0 + serialization.ddl struct bucket_small { 
string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 226 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small + Truncated Path -> Alias: + /bucket_big [b] + + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + b + TableScan + alias: b + GatherStats: false + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: bucket_big + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 0 + numRows 0 + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 0 + numRows 0 + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big + Truncated Path -> Alias: + /bucket_big [b] + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@bucket_big +PREHOOK: Input: default@bucket_small +#### A masked pattern was here #### +POSTHOOK: query: select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@bucket_big +POSTHOOK: Input: default@bucket_small +#### A masked pattern was here #### +19 +PREHOOK: query: -- The hash table can only hold a very small amount (based on the parameter hive.mapjoin.localtask.max.memory.usage) +-- so, the attempt to convert to map-join should fail. 
+ +select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@bucket_big +PREHOOK: Input: default@bucket_small +#### A masked pattern was here #### +Execution failed with exit status: 3 +Obtaining error information + +Task failed! +Task ID: + Stage-8 + +Logs: + +#### A masked pattern was here #### +FAILED: Execution Error, return code 3 from org.apache.hadoop.hive.ql.exec.MapredLocalTask +ATTEMPT: Execute BackupTask: org.apache.hadoop.hive.ql.exec.MapRedTask +POSTHOOK: query: -- The hash table can only hold a very small amount (based on the parameter hive.mapjoin.localtask.max.memory.usage) +-- so, the attempt to convert to map-join should fail. + +select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@bucket_big +POSTHOOK: Input: default@bucket_small +#### A masked pattern was here #### +19 Index: ql/src/test/results/clientpositive/auto_sortmerge_join_4.q.out =================================================================== --- ql/src/test/results/clientpositive/auto_sortmerge_join_4.q.out (revision 0) +++ ql/src/test/results/clientpositive/auto_sortmerge_join_4.q.out (working copy) @@ -0,0 +1,729 @@ +PREHOOK: query: -- small 2 part, 4 bucket & big 1 part, 2 bucket +CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- small 2 part, 4 bucket & big 1 part, 2 bucket +CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@bucket_small +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small +POSTHOOK: Output: default@bucket_small@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small@ds=2008-04-08 +PREHOOK: query: load data local inpath 
'../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small +POSTHOOK: Output: default@bucket_small@ds=2008-04-09 +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small@ds=2008-04-09 +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small@ds=2008-04-09 +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small@ds=2008-04-09 +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small@ds=2008-04-09 +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small@ds=2008-04-09 +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small@ds=2008-04-09 +PREHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@bucket_big +PREHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big +POSTHOOK: Output: default@bucket_big@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big@ds=2008-04-08 +PREHOOK: query: -- Since size is being used to find the big table, the order of the tables in the join does not matter +explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: -- Since size is being used to find the big table, the order of the tables in the join does not matter +explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME bucket_small) a) (TOK_TABREF (TOK_TABNAME bucket_big) b) (= 
(. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count))))) + +STAGE DEPENDENCIES: + Stage-6 is a root stage , consists of Stage-7, Stage-8, Stage-1 + Stage-7 has a backup stage: Stage-1 + Stage-4 depends on stages: Stage-7 + Stage-2 depends on stages: Stage-1, Stage-4, Stage-5 + Stage-8 has a backup stage: Stage-1 + Stage-5 depends on stages: Stage-8 + Stage-1 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-6 + Conditional Operator + + Stage: Stage-7 + Map Reduce Local Work + Alias -> Map Local Tables: + b + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + b + TableScan + alias: b + GatherStats: false + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + GatherStats: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Local Work: + Map Reduce Local Work + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 1 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 1 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + 
bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 452 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 452 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 452 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 452 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small + Truncated Path -> Alias: + /bucket_small/ds=2008-04-08 [a] + /bucket_small/ds=2008-04-09 [a] + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: -mr-10002 + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + 
escape.delim \ + + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0 + columns.types bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Truncated Path -> Alias: +#### A masked pattern was here #### + + Stage: Stage-8 + Map Reduce Local Work + Alias -> Map Local Tables: + a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + a + TableScan + alias: a + GatherStats: false + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + + Stage: Stage-5 + Map Reduce + Alias -> Map Operator Tree: + b + TableScan + alias: b + GatherStats: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Local Work: + Map Reduce Local Work + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 1 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 1 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A 
masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 452 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 452 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 452 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 452 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small + Truncated Path -> Alias: + /bucket_big/ds=2008-04-08 [b] + + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + b + TableScan + alias: b + GatherStats: false + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of 
Big Table: 1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 1 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 1 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big + Truncated Path -> Alias: + /bucket_big/ds=2008-04-08 [b] + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@bucket_big@ds=2008-04-08 +PREHOOK: Input: default@bucket_small@ds=2008-04-08 +PREHOOK: Input: default@bucket_small@ds=2008-04-09 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@bucket_big@ds=2008-04-08 +POSTHOOK: Input: default@bucket_small@ds=2008-04-08 +POSTHOOK: Input: default@bucket_small@ds=2008-04-09 +#### A masked pattern was here #### +38 +PREHOOK: query: -- The hash table can only hold a very small amount (based on the parameter hive.mapjoin.localtask.max.memory.usage) +-- so, the attempt to convert to map-join should fail. + +select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@bucket_big@ds=2008-04-08 +PREHOOK: Input: default@bucket_small@ds=2008-04-08 +PREHOOK: Input: default@bucket_small@ds=2008-04-09 +#### A masked pattern was here #### +Execution failed with exit status: 3 +Obtaining error information + +Task failed! 
+Task ID: + Stage-8 + +Logs: + +#### A masked pattern was here #### +FAILED: Execution Error, return code 3 from org.apache.hadoop.hive.ql.exec.MapredLocalTask +ATTEMPT: Execute BackupTask: org.apache.hadoop.hive.ql.exec.MapRedTask +POSTHOOK: query: -- The hash table can only hold a very small amount (based on the parameter hive.mapjoin.localtask.max.memory.usage) +-- so, the attempt to convert to map-join should fail. + +select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@bucket_big@ds=2008-04-08 +POSTHOOK: Input: default@bucket_small@ds=2008-04-08 +POSTHOOK: Input: default@bucket_small@ds=2008-04-09 +#### A masked pattern was here #### +38 Index: ql/src/test/results/clientpositive/auto_sortmerge_join_8.q.out =================================================================== --- ql/src/test/results/clientpositive/auto_sortmerge_join_8.q.out (revision 0) +++ ql/src/test/results/clientpositive/auto_sortmerge_join_8.q.out (working copy) @@ -0,0 +1,360 @@ +PREHOOK: query: -- small 2 part, 2 bucket & big 2 part, 4 bucket +CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- small 2 part, 2 bucket & big 2 part, 4 bucket +CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@bucket_small +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small +POSTHOOK: Output: default@bucket_small@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small +POSTHOOK: Output: default@bucket_small@ds=2008-04-09 +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small@ds=2008-04-09 +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small@ds=2008-04-09 +PREHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE bucket_big 
(key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@bucket_big +PREHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big +POSTHOOK: Output: default@bucket_big@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big +POSTHOOK: Output: default@bucket_big@ds=2008-04-09 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big@ds=2008-04-09 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big@ds=2008-04-09 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big@ds=2008-04-09 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big@ds=2008-04-09 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big@ds=2008-04-09 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big@ds=2008-04-09 +PREHOOK: query: -- By setting 
hive.mapjoin.smalltable.filesize to a small number, none of the tables is a candidate for mapjoin +-- Since size is being used to find the big table, the order of the tables in the join does not matter +explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: -- By setting hive.mapjoin.smalltable.filesize to a small number, none of the tables is a candidate for mapjoin +-- Since size is being used to find the big table, the order of the tables in the join does not matter +explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME bucket_small) a) (TOK_TABREF (TOK_TABNAME bucket_big) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count))))) + +STAGE DEPENDENCIES: + Stage-6 is a root stage , consists of Stage-1 + Stage-1 + Stage-2 depends on stages: Stage-1 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-6 + Conditional Operator + + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + b + TableScan + alias: b + GatherStats: false + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 11624 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here 
#### + Partition + base file name: ds=2008-04-09 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-09 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 11624 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big + Truncated Path -> Alias: + /bucket_big/ds=2008-04-08 [b] + /bucket_big/ds=2008-04-09 [b] + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: -mr-10002 + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0 + columns.types bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Truncated Path -> Alias: +#### A masked pattern was here #### + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@bucket_big@ds=2008-04-08 +PREHOOK: Input: default@bucket_big@ds=2008-04-09 +PREHOOK: Input: 
default@bucket_small@ds=2008-04-08 +PREHOOK: Input: default@bucket_small@ds=2008-04-09 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@bucket_big@ds=2008-04-08 +POSTHOOK: Input: default@bucket_big@ds=2008-04-09 +POSTHOOK: Input: default@bucket_small@ds=2008-04-08 +POSTHOOK: Input: default@bucket_small@ds=2008-04-09 +#### A masked pattern was here #### +76 +PREHOOK: query: -- The hash table can only hold a very small amount (based on the parameter hive.mapjoin.localtask.max.memory.usage) +-- so, the attempt to convert to map-join should fail. + +select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@bucket_big@ds=2008-04-08 +PREHOOK: Input: default@bucket_big@ds=2008-04-09 +PREHOOK: Input: default@bucket_small@ds=2008-04-08 +PREHOOK: Input: default@bucket_small@ds=2008-04-09 +#### A masked pattern was here #### +POSTHOOK: query: -- The hash table can only hold a very small amount (based on the parameter hive.mapjoin.localtask.max.memory.usage) +-- so, the attempt to convert to map-join should fail. + +select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@bucket_big@ds=2008-04-08 +POSTHOOK: Input: default@bucket_big@ds=2008-04-09 +POSTHOOK: Input: default@bucket_small@ds=2008-04-08 +POSTHOOK: Input: default@bucket_small@ds=2008-04-09 +#### A masked pattern was here #### +76 Index: ql/src/test/results/clientpositive/auto_sortmerge_join_3.q.out =================================================================== --- ql/src/test/results/clientpositive/auto_sortmerge_join_3.q.out (revision 0) +++ ql/src/test/results/clientpositive/auto_sortmerge_join_3.q.out (working copy) @@ -0,0 +1,717 @@ +PREHOOK: query: -- small 2 part, 2 bucket & big 1 part, 4 bucket +CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- small 2 part, 2 bucket & big 1 part, 4 bucket +CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@bucket_small +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small +POSTHOOK: Output: default@bucket_small@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small +POSTHOOK: query: load data local inpath 
'../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small +POSTHOOK: Output: default@bucket_small@ds=2008-04-09 +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small@ds=2008-04-09 +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small@ds=2008-04-09 +PREHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@bucket_big +PREHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big +POSTHOOK: Output: default@bucket_big@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big@ds=2008-04-08 +PREHOOK: query: -- Since size is being used to find the big table, the order of the tables in the join does not matter +explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: -- Since size is being used to find the big table, the order of the tables in the join does not matter +explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME bucket_small) a) (TOK_TABREF (TOK_TABNAME bucket_big) b) (= (. (TOK_TABLE_OR_COL a) key) (. 
(TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count))))) + +STAGE DEPENDENCIES: + Stage-6 is a root stage , consists of Stage-7, Stage-8, Stage-1 + Stage-7 has a backup stage: Stage-1 + Stage-4 depends on stages: Stage-7 + Stage-2 depends on stages: Stage-1, Stage-4, Stage-5 + Stage-8 has a backup stage: Stage-1 + Stage-5 depends on stages: Stage-8 + Stage-1 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-6 + Conditional Operator + + Stage: Stage-7 + Map Reduce Local Work + Alias -> Map Local Tables: + b + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + b + TableScan + alias: b + GatherStats: false + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + GatherStats: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Local Work: + Map Reduce Local Work + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 1 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 1 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value 
+ columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 228 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 228 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 228 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 228 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small + Truncated Path -> Alias: + /bucket_small/ds=2008-04-08 [a] + /bucket_small/ds=2008-04-09 [a] + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: -mr-10002 + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + + input format: 
org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0 + columns.types bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Truncated Path -> Alias: +#### A masked pattern was here #### + + Stage: Stage-8 + Map Reduce Local Work + Alias -> Map Local Tables: + a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + a + TableScan + alias: a + GatherStats: false + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + + Stage: Stage-5 + Map Reduce + Alias -> Map Operator Tree: + b + TableScan + alias: b + GatherStats: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Local Work: + Map Reduce Local Work + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 1 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 1 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + 
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 228 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 228 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 228 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 228 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small + Truncated Path -> Alias: + /bucket_big/ds=2008-04-08 [b] + + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + b + TableScan + alias: b + GatherStats: false + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + File Output 
Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 1 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 1 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big + Truncated Path -> Alias: + /bucket_big/ds=2008-04-08 [b] + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@bucket_big@ds=2008-04-08 +PREHOOK: Input: default@bucket_small@ds=2008-04-08 +PREHOOK: Input: default@bucket_small@ds=2008-04-09 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@bucket_big@ds=2008-04-08 +POSTHOOK: Input: default@bucket_small@ds=2008-04-08 +POSTHOOK: Input: default@bucket_small@ds=2008-04-09 +#### A masked pattern was here #### +38 +PREHOOK: query: -- The hash table can only hold a very small amount (based on the parameter hive.mapjoin.localtask.max.memory.usage) +-- so, the attempt to convert to map-join should fail. + +select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@bucket_big@ds=2008-04-08 +PREHOOK: Input: default@bucket_small@ds=2008-04-08 +PREHOOK: Input: default@bucket_small@ds=2008-04-09 +#### A masked pattern was here #### +Execution failed with exit status: 3 +Obtaining error information + +Task failed! 
+Task ID: + Stage-8 + +Logs: + +#### A masked pattern was here #### +FAILED: Execution Error, return code 3 from org.apache.hadoop.hive.ql.exec.MapredLocalTask +ATTEMPT: Execute BackupTask: org.apache.hadoop.hive.ql.exec.MapRedTask +POSTHOOK: query: -- The hash table can only hold a very small amount (based on the parameter hive.mapjoin.localtask.max.memory.usage) +-- so, the attempt to convert to map-join should fail. + +select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@bucket_big@ds=2008-04-08 +POSTHOOK: Input: default@bucket_small@ds=2008-04-08 +POSTHOOK: Input: default@bucket_small@ds=2008-04-09 +#### A masked pattern was here #### +38 Index: ql/src/test/results/clientpositive/auto_sortmerge_join_7.q.out =================================================================== --- ql/src/test/results/clientpositive/auto_sortmerge_join_7.q.out (revision 0) +++ ql/src/test/results/clientpositive/auto_sortmerge_join_7.q.out (working copy) @@ -0,0 +1,898 @@ +PREHOOK: query: -- small 2 part, 4 bucket & big 2 part, 2 bucket +CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- small 2 part, 4 bucket & big 2 part, 2 bucket +CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@bucket_small +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small +POSTHOOK: Output: default@bucket_small@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small +POSTHOOK: query: load data local inpath 
'../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small +POSTHOOK: Output: default@bucket_small@ds=2008-04-09 +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small@ds=2008-04-09 +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small@ds=2008-04-09 +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small@ds=2008-04-09 +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small@ds=2008-04-09 +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small@ds=2008-04-09 +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small@ds=2008-04-09 +PREHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@bucket_big +PREHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big +POSTHOOK: Output: default@bucket_big@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big +POSTHOOK: Output: default@bucket_big@ds=2008-04-09 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big@ds=2008-04-09 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: 
default@bucket_big@ds=2008-04-09 +PREHOOK: query: -- Since size is being used to find the big table, the order of the tables in the join does not matter +explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: -- Since size is being used to find the big table, the order of the tables in the join does not matter +explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME bucket_small) a) (TOK_TABREF (TOK_TABNAME bucket_big) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count))))) + +STAGE DEPENDENCIES: + Stage-6 is a root stage , consists of Stage-7, Stage-8, Stage-1 + Stage-7 has a backup stage: Stage-1 + Stage-4 depends on stages: Stage-7 + Stage-2 depends on stages: Stage-1, Stage-4, Stage-5 + Stage-8 has a backup stage: Stage-1 + Stage-5 depends on stages: Stage-8 + Stage-1 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-6 + Conditional Operator + + Stage: Stage-7 + Map Reduce Local Work + Alias -> Map Local Tables: + b + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + b + TableScan + alias: b + GatherStats: false + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + GatherStats: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Local Work: + Map Reduce Local Work + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns 
ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5500 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-09 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-09 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5500 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 452 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 452 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 452 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 452 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small + Truncated Path -> Alias: + /bucket_small/ds=2008-04-08 [a] + /bucket_small/ds=2008-04-09 [a] + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: -mr-10002 + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0 + columns.types bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Truncated Path -> Alias: +#### A masked pattern was here #### + + Stage: Stage-8 + Map Reduce Local Work + Alias -> Map Local Tables: + a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + a + TableScan + alias: a + GatherStats: false + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + + Stage: Stage-5 + Map Reduce + Alias -> Map 
Operator Tree: + b + TableScan + alias: b + GatherStats: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Local Work: + Map Reduce Local Work + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5500 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-09 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-09 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + 
serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5500 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 452 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 452 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 452 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 452 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small + Truncated Path -> Alias: + /bucket_big/ds=2008-04-08 [b] + /bucket_big/ds=2008-04-09 [b] + + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + b + TableScan + 
alias: b + GatherStats: false + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5500 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-09 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-09 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} 
+ serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5500 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big + Truncated Path -> Alias: + /bucket_big/ds=2008-04-08 [b] + /bucket_big/ds=2008-04-09 [b] + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@bucket_big@ds=2008-04-08 +PREHOOK: Input: default@bucket_big@ds=2008-04-09 +PREHOOK: Input: default@bucket_small@ds=2008-04-08 +PREHOOK: Input: default@bucket_small@ds=2008-04-09 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@bucket_big@ds=2008-04-08 +POSTHOOK: Input: default@bucket_big@ds=2008-04-09 +POSTHOOK: Input: default@bucket_small@ds=2008-04-08 +POSTHOOK: Input: default@bucket_small@ds=2008-04-09 +#### A masked pattern was here #### +76 +PREHOOK: query: -- The hash table can only hold a very small amount (based on the parameter hive.mapjoin.localtask.max.memory.usage) +-- so, the attempt to convert to map-join should fail. + +select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@bucket_big@ds=2008-04-08 +PREHOOK: Input: default@bucket_big@ds=2008-04-09 +PREHOOK: Input: default@bucket_small@ds=2008-04-08 +PREHOOK: Input: default@bucket_small@ds=2008-04-09 +#### A masked pattern was here #### +Execution failed with exit status: 3 +Obtaining error information + +Task failed! +Task ID: + Stage-8 + +Logs: + +#### A masked pattern was here #### +FAILED: Execution Error, return code 3 from org.apache.hadoop.hive.ql.exec.MapredLocalTask +ATTEMPT: Execute BackupTask: org.apache.hadoop.hive.ql.exec.MapRedTask +POSTHOOK: query: -- The hash table can only hold a very small amount (based on the parameter hive.mapjoin.localtask.max.memory.usage) +-- so, the attempt to convert to map-join should fail. 
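[Editor's note] The failure-and-recovery sequence echoed above is the conditional-task design visible in the plan: the optimizer first tries to build the small table's hash table in a local task (the Map Reduce Local Work stages with "has a backup stage: Stage-1"), and when that local task dies against the hive.mapjoin.localtask.max.memory.usage cap, the backup MapRedTask runs the Sorted Merge Bucket Map Join instead, so the query still returns the same count. A minimal sketch of the kind of settings such a test would enable is below; the values and the exact set of flags are assumptions for illustration and are not taken from these golden files (the .q scripts themselves are not part of this excerpt).

-- Editor's sketch: assumed session settings, not the actual test script.
set hive.auto.sort.merge.join=true;                  -- flag introduced by this patch (assumed to be enabled by the test)
set hive.mapjoin.localtask.max.memory.usage=0.0001;  -- assumed tiny cap so the local hash-table build fails
select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key;
-- Expected behaviour per the output above: the MapredLocalTask exits with return code 3,
-- the backup MapRedTask (the Sorted Merge Bucket Map Join in Stage-1) runs, and the count matches.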
+ +select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@bucket_big@ds=2008-04-08 +POSTHOOK: Input: default@bucket_big@ds=2008-04-09 +POSTHOOK: Input: default@bucket_small@ds=2008-04-08 +POSTHOOK: Input: default@bucket_small@ds=2008-04-09 +#### A masked pattern was here #### +76 Index: ql/src/test/results/clientpositive/auto_sortmerge_join_2.q.out =================================================================== --- ql/src/test/results/clientpositive/auto_sortmerge_join_2.q.out (revision 0) +++ ql/src/test/results/clientpositive/auto_sortmerge_join_2.q.out (working copy) @@ -0,0 +1,770 @@ +PREHOOK: query: -- small 1 part, 4 bucket & big 2 part, 2 bucket +CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- small 1 part, 4 bucket & big 2 part, 2 bucket +CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@bucket_small +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small +POSTHOOK: Output: default@bucket_small@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small@ds=2008-04-08 +PREHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@bucket_big +PREHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big +POSTHOOK: query: load 
data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big +POSTHOOK: Output: default@bucket_big@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big +POSTHOOK: Output: default@bucket_big@ds=2008-04-09 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big@ds=2008-04-09 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big@ds=2008-04-09 +PREHOOK: query: -- Since the leftmost table is assumed as the big table, arrange the tables in the join accordingly +explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: -- Since the leftmost table is assumed as the big table, arrange the tables in the join accordingly +explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME bucket_big) a) (TOK_TABREF (TOK_TABNAME bucket_small) b) (= (. (TOK_TABLE_OR_COL a) key) (. 
(TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count))))) + +STAGE DEPENDENCIES: + Stage-6 is a root stage , consists of Stage-7, Stage-8, Stage-1 + Stage-7 has a backup stage: Stage-1 + Stage-4 depends on stages: Stage-7 + Stage-2 depends on stages: Stage-1, Stage-4, Stage-5 + Stage-8 has a backup stage: Stage-1 + Stage-5 depends on stages: Stage-8 + Stage-1 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-6 + Conditional Operator + + Stage: Stage-7 + Map Reduce Local Work + Alias -> Map Local Tables: + b + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + b + TableScan + alias: b + GatherStats: false + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + GatherStats: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Local Work: + Map Reduce Local Work + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5500 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-09 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-09 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + 
bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5500 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 1 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 226 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 1 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 226 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small + Truncated Path -> Alias: + /bucket_big/ds=2008-04-08 [a] + /bucket_big/ds=2008-04-09 [a] + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: -mr-10002 + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + 
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0 + columns.types bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Truncated Path -> Alias: +#### A masked pattern was here #### + + Stage: Stage-8 + Map Reduce Local Work + Alias -> Map Local Tables: + a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + a + TableScan + alias: a + GatherStats: false + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + + Stage: Stage-5 + Map Reduce + Alias -> Map Operator Tree: + b + TableScan + alias: b + GatherStats: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Local Work: + Map Reduce Local Work + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5500 +#### A masked pattern was 
here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-09 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-09 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5500 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 1 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 226 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 1 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 226 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small + Truncated Path -> Alias: + /bucket_small/ds=2008-04-08 [b] + + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + GatherStats: false + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + 
Position of Big Table: 0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5500 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-09 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-09 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5500 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe 
+ name: default.bucket_big + name: default.bucket_big + Truncated Path -> Alias: + /bucket_big/ds=2008-04-08 [a] + /bucket_big/ds=2008-04-09 [a] + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@bucket_big@ds=2008-04-08 +PREHOOK: Input: default@bucket_big@ds=2008-04-09 +PREHOOK: Input: default@bucket_small@ds=2008-04-08 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@bucket_big@ds=2008-04-08 +POSTHOOK: Input: default@bucket_big@ds=2008-04-09 +POSTHOOK: Input: default@bucket_small@ds=2008-04-08 +#### A masked pattern was here #### +38 +PREHOOK: query: -- The hash table can only hold a very small amount (based on the parameter hive.mapjoin.localtask.max.memory.usage) +-- so, the attempt to convert to map-join should fail. + +select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@bucket_big@ds=2008-04-08 +PREHOOK: Input: default@bucket_big@ds=2008-04-09 +PREHOOK: Input: default@bucket_small@ds=2008-04-08 +#### A masked pattern was here #### +Execution failed with exit status: 3 +Obtaining error information + +Task failed! +Task ID: + Stage-8 + +Logs: + +#### A masked pattern was here #### +FAILED: Execution Error, return code 3 from org.apache.hadoop.hive.ql.exec.MapredLocalTask +ATTEMPT: Execute BackupTask: org.apache.hadoop.hive.ql.exec.MapRedTask +POSTHOOK: query: -- The hash table can only hold a very small amount (based on the parameter hive.mapjoin.localtask.max.memory.usage) +-- so, the attempt to convert to map-join should fail. 
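
Note on the plan above: per its own leading comment, this explain output was produced with the leftmost-table policy, i.e. hive.auto.sort.merge.join.bigtbl.matcher left at org.apache.hadoop.hive.ql.optimizer.LeftSortMergeJoinBigTableMatcher. That class is not included in this section of the patch; based on how AbstractSMBJoinProc (further down) invokes getBigTablePosition(ParseContext, JoinOperator), a minimal "leftmost table wins" policy would be little more than the following sketch (class name and body are illustrative, not the shipped source):

    package org.apache.hadoop.hive.ql.optimizer;

    import org.apache.hadoop.hive.ql.exec.JoinOperator;
    import org.apache.hadoop.hive.ql.parse.ParseContext;
    import org.apache.hadoop.hive.ql.parse.SemanticException;

    // Sketch only: the real LeftSortMergeJoinBigTableMatcher is not shown in this hunk;
    // the interface method is inferred from its callers in AbstractSMBJoinProc.
    public class LeftBigTableMatcherSketch implements SortMergeJoinBigTableMatcher {
      public int getBigTablePosition(ParseContext parseCtx, JoinOperator joinOp)
          throws SemanticException {
        // Position 0 is the leftmost table in the join; it is streamed as the big table.
        return 0;
      }
    }

With this policy the join order, not table size, decides which side is streamed, which is why the query above places bucket_big first.
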
+ +select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@bucket_big@ds=2008-04-08 +POSTHOOK: Input: default@bucket_big@ds=2008-04-09 +POSTHOOK: Input: default@bucket_small@ds=2008-04-08 +#### A masked pattern was here #### +38 Index: ql/src/test/queries/clientpositive/auto_sortmerge_join_3.q =================================================================== --- ql/src/test/queries/clientpositive/auto_sortmerge_join_3.q (revision 0) +++ ql/src/test/queries/clientpositive/auto_sortmerge_join_3.q (working copy) @@ -0,0 +1,31 @@ +-- small 2 part, 2 bucket & big 1 part, 4 bucket +CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE; +load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08'); +load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08'); + +load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09'); +load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09'); + +CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE; +load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08'); +load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08'); +load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08'); +load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08'); + +set hive.auto.sort.merge.join=true; +set hive.optimize.bucketmapjoin = true; +set hive.optimize.bucketmapjoin.sortedmerge = true; + +set hive.auto.sort.merge.join.bigtbl.matcher = org.apache.hadoop.hive.ql.optimizer.SizeSortMergeJoinBigTableMatcher; + +-- Since size is being used to find the big table, the order of the tables in the join does not matter +explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key; +select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key; + +set hive.mapjoin.check.memory.rows=1; +set hive.mapjoin.localtask.max.memory.usage=0.0000000001; + +-- The hash table can only hold a very small amount (based on the parameter hive.mapjoin.localtask.max.memory.usage) +-- so, the attempt to convert to map-join should fail. 
+ +select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key; Index: ql/src/test/queries/clientpositive/auto_sortmerge_join_7.q =================================================================== --- ql/src/test/queries/clientpositive/auto_sortmerge_join_7.q (revision 0) +++ ql/src/test/queries/clientpositive/auto_sortmerge_join_7.q (working copy) @@ -0,0 +1,36 @@ +-- small 2 part, 4 bucket & big 2 part, 2 bucket +CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE; +load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08'); +load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08'); +load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08'); +load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08'); + +load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09'); +load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09'); +load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09'); +load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09'); + +CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE; +load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08'); +load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08'); + +load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09'); +load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09'); + +set hive.auto.sort.merge.join=true; +set hive.optimize.bucketmapjoin = true; +set hive.optimize.bucketmapjoin.sortedmerge = true; + +set hive.auto.sort.merge.join.bigtbl.matcher = org.apache.hadoop.hive.ql.optimizer.AvgPartitionSizeSortMergeJoinBigTableMatcher; + +-- Since size is being used to find the big table, the order of the tables in the join does not matter +explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key; +select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key; + +set hive.mapjoin.check.memory.rows=1; +set hive.mapjoin.localtask.max.memory.usage=0.0000000001; + +-- The hash table can only hold a very small amount (based on the parameter hive.mapjoin.localtask.max.memory.usage) +-- so, the attempt to convert to map-join should fail. 
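
The .q tests above and below enable the feature through CLI "set" commands. The same switches can be flipped programmatically, for example from a JUnit test or an embedded driver; a rough equivalent using the string keys exactly as the tests spell them (the HiveConf(Class) constructor and Configuration setters are standard Hadoop/Hive API):

    import org.apache.hadoop.hive.conf.HiveConf;

    public class AutoSortMergeJoinConfExample {
      public static void main(String[] args) {
        HiveConf conf = new HiveConf(HiveConf.class);
        // Same switches the .q tests issue through "set ..."
        conf.setBoolean("hive.auto.sort.merge.join", true);
        conf.setBoolean("hive.optimize.bucketmapjoin", true);
        conf.setBoolean("hive.optimize.bucketmapjoin.sortedmerge", true);
        conf.set("hive.auto.sort.merge.join.bigtbl.matcher",
            "org.apache.hadoop.hive.ql.optimizer.SizeSortMergeJoinBigTableMatcher");
        System.out.println(conf.get("hive.auto.sort.merge.join.bigtbl.matcher"));
      }
    }
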
+ +select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key; Index: ql/src/test/queries/clientpositive/auto_sortmerge_join_4.q =================================================================== --- ql/src/test/queries/clientpositive/auto_sortmerge_join_4.q (revision 0) +++ ql/src/test/queries/clientpositive/auto_sortmerge_join_4.q (working copy) @@ -0,0 +1,33 @@ +-- small 2 part, 4 bucket & big 1 part, 2 bucket +CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE; +load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08'); +load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08'); +load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08'); +load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08'); + +load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09'); +load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09'); +load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09'); +load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09'); + +CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE; +load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08'); +load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08'); + +set hive.auto.sort.merge.join=true; +set hive.optimize.bucketmapjoin = true; +set hive.optimize.bucketmapjoin.sortedmerge = true; + +set hive.auto.sort.merge.join.bigtbl.matcher = org.apache.hadoop.hive.ql.optimizer.AvgPartitionSizeSortMergeJoinBigTableMatcher; + +-- Since size is being used to find the big table, the order of the tables in the join does not matter +explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key; +select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key; + +set hive.mapjoin.check.memory.rows=1; +set hive.mapjoin.localtask.max.memory.usage=0.0000000001; + +-- The hash table can only hold a very small amount (based on the parameter hive.mapjoin.localtask.max.memory.usage) +-- so, the attempt to convert to map-join should fail. 
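
Several of these tests deliberately mix bucket counts (for example a 4-bucket small table joined with a 2-bucket big table). That combination is still convertible because bucket map joins only require the two bucket counts to be multiples of each other: with hash-mod-numBuckets bucketing, a row's bucket under the smaller count is fully determined by its bucket under the larger count. The exact per-type hash Hive applies is not shown here; the modulo relationship is the point of this small illustration:

    public class BucketCompatibilityDemo {
      public static void main(String[] args) {
        for (int hash = 0; hash < 8; hash++) {
          int bucketOf4 = hash % 4;  // bucket in the 4-bucket table
          int bucketOf2 = hash % 2;  // bucket in the 2-bucket table
          // (hash % 4) % 2 == hash % 2, so each 4-bucket file maps to exactly one
          // 2-bucket file and the sort-merge reader only co-iterates matching buckets.
          System.out.println(hash + ": " + bucketOf4 + " -> " + bucketOf2
              + " consistent=" + (bucketOf4 % 2 == bucketOf2));
        }
      }
    }
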
+ +select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key; Index: ql/src/test/queries/clientpositive/auto_sortmerge_join_8.q =================================================================== --- ql/src/test/queries/clientpositive/auto_sortmerge_join_8.q (revision 0) +++ ql/src/test/queries/clientpositive/auto_sortmerge_join_8.q (working copy) @@ -0,0 +1,38 @@ +-- small 2 part, 2 bucket & big 2 part, 4 bucket +CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE; +load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08'); +load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08'); + +load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09'); +load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09'); + +CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE; +load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08'); +load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08'); +load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08'); +load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08'); + +load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09'); +load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09'); +load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09'); +load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09'); + +set hive.auto.sort.merge.join=true; +set hive.optimize.bucketmapjoin = true; +set hive.optimize.bucketmapjoin.sortedmerge = true; +set hive.mapjoin.smalltable.filesize=50; + +set hive.auto.sort.merge.join.bigtbl.matcher = org.apache.hadoop.hive.ql.optimizer.SizeSortMergeJoinBigTableMatcher; + +-- By setting hive.mapjoin.smalltable.filesize to a small number, none of the tables is a candidate for mapjoin +-- Since size is being used to find the big table, the order of the tables in the join does not matter +explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key; +select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key; + +set hive.mapjoin.check.memory.rows=1; +set hive.mapjoin.localtask.max.memory.usage=0.0000000001; + +-- The hash table can only hold a very small amount (based on the parameter hive.mapjoin.localtask.max.memory.usage) +-- so, the attempt to convert to map-join should fail. 
+ +select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key; Index: ql/src/test/queries/clientpositive/auto_sortmerge_join_1.q =================================================================== --- ql/src/test/queries/clientpositive/auto_sortmerge_join_1.q (revision 0) +++ ql/src/test/queries/clientpositive/auto_sortmerge_join_1.q (working copy) @@ -0,0 +1,34 @@ +-- small 1 part, 2 bucket & big 2 part, 4 bucket + +CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE; +load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08'); +load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08'); + +CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE; +load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08'); +load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08'); +load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08'); +load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08'); + +load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09'); +load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09'); +load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09'); +load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09'); + +set hive.auto.sort.merge.join=true; +set hive.optimize.bucketmapjoin = true; +set hive.optimize.bucketmapjoin.sortedmerge = true; + +set hive.auto.sort.merge.join.bigtbl.matcher = org.apache.hadoop.hive.ql.optimizer.SizeSortMergeJoinBigTableMatcher; + +-- Since size is being used to find the big table, the order of the tables in the join does not matter +explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key; +select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key; + +set hive.mapjoin.check.memory.rows=1; +set hive.mapjoin.localtask.max.memory.usage=0.0000000001; + +-- The hash table can only hold a very small amount (based on the parameter hive.mapjoin.localtask.max.memory.usage) +-- so, the attempt to convert to map-join should fail. 
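
auto_sortmerge_join_8.q and auto_sortmerge_join_1.q above (like several of the other tests) pick the size-based policy, org.apache.hadoop.hive.ql.optimizer.SizeSortMergeJoinBigTableMatcher, whose source is not part of this section. Judging from its sibling AvgPartitionSizeSortMergeJoinBigTableMatcher further down, which shares the CommonSizeSortMergeJoinBigTableMatcher helpers, a plausible total-size variant looks roughly like the sketch below. The base class and the getListTopOps/getSize helpers are assumed to behave as they do in that sibling; this is not the shipped implementation.

    package org.apache.hadoop.hive.ql.optimizer;

    import java.util.ArrayList;
    import java.util.List;

    import org.apache.hadoop.hive.conf.HiveConf;
    import org.apache.hadoop.hive.ql.exec.JoinOperator;
    import org.apache.hadoop.hive.ql.exec.TableScanOperator;
    import org.apache.hadoop.hive.ql.metadata.HiveException;
    import org.apache.hadoop.hive.ql.metadata.Partition;
    import org.apache.hadoop.hive.ql.metadata.Table;
    import org.apache.hadoop.hive.ql.optimizer.ppr.PartitionPruner;
    import org.apache.hadoop.hive.ql.parse.ParseContext;
    import org.apache.hadoop.hive.ql.parse.PrunedPartitionList;
    import org.apache.hadoop.hive.ql.parse.SemanticException;

    // Sketch of a "largest total size wins" big-table matcher (helper names assumed).
    public class SizeBigTableMatcherSketch extends CommonSizeSortMergeJoinBigTableMatcher
        implements SortMergeJoinBigTableMatcher {

      public int getBigTablePosition(ParseContext parseCtx, JoinOperator joinOp)
          throws SemanticException {
        int bigTablePos = 0;
        long maxSize = -1;
        HiveConf conf = parseCtx.getConf();
        try {
          List<TableScanOperator> topOps = new ArrayList<TableScanOperator>();
          getListTopOps(joinOp, topOps);
          int currentPos = 0;
          for (TableScanOperator topOp : topOps) {
            Table table = parseCtx.getTopToTable().get(topOp);
            long totalSize = 0;
            if (!table.isPartitioned()) {
              totalSize = getSize(conf, table);
            } else {
              // Sum the sizes of all partitions that survive pruning
              PrunedPartitionList partsList = PartitionPruner.prune(table,
                  parseCtx.getOpToPartPruner().get(topOp), conf, null,
                  parseCtx.getPrunedPartitions());
              for (Partition part : partsList.getNotDeniedPartns()) {
                totalSize += getSize(conf, part);
              }
            }
            if (totalSize > maxSize) {
              maxSize = totalSize;
              bigTablePos = currentPos;
            }
            currentPos++;
          }
        } catch (HiveException e) {
          throw new SemanticException(e.getMessage());
        }
        return bigTablePos;
      }
    }
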
+ +select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key; Index: ql/src/test/queries/clientpositive/auto_sortmerge_join_5.q =================================================================== --- ql/src/test/queries/clientpositive/auto_sortmerge_join_5.q (revision 0) +++ ql/src/test/queries/clientpositive/auto_sortmerge_join_5.q (working copy) @@ -0,0 +1,28 @@ +-- small no part, 4 bucket & big no part, 2 bucket +CREATE TABLE bucket_small (key string, value string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE; +load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small; +load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small; +load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small; +load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small; + +CREATE TABLE bucket_big (key string, value string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE; +load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big; +load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big; + +set hive.auto.sort.merge.join=true; +set hive.optimize.bucketmapjoin = true; +set hive.optimize.bucketmapjoin.sortedmerge = true; + +set hive.auto.sort.merge.join.bigtbl.matcher = org.apache.hadoop.hive.ql.optimizer.AvgPartitionSizeSortMergeJoinBigTableMatcher; + +-- Since size is being used to find the big table, the order of the tables in the join does not matter +explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key; +select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key; + +set hive.mapjoin.check.memory.rows=1; +set hive.mapjoin.localtask.max.memory.usage=0.0000000001; + +-- The hash table can only hold a very small amount (based on the parameter hive.mapjoin.localtask.max.memory.usage) +-- so, the attempt to convert to map-join should fail. 
+ +select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key; Index: ql/src/test/queries/clientpositive/auto_sortmerge_join_2.q =================================================================== --- ql/src/test/queries/clientpositive/auto_sortmerge_join_2.q (revision 0) +++ ql/src/test/queries/clientpositive/auto_sortmerge_join_2.q (working copy) @@ -0,0 +1,31 @@ +-- small 1 part, 4 bucket & big 2 part, 2 bucket +CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE; +load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08'); +load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08'); +load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08'); +load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08'); + +CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE; +load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08'); +load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08'); + +load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09'); +load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09'); + +set hive.auto.sort.merge.join=true; +set hive.optimize.bucketmapjoin = true; +set hive.optimize.bucketmapjoin.sortedmerge = true; + +set hive.auto.sort.merge.join.bigtbl.matcher = org.apache.hadoop.hive.ql.optimizer.LeftSortMergeJoinBigTableMatcher; + +-- Since the leftmost table is assumed as the big table, arrange the tables in the join accordingly +explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key; +select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key; + +set hive.mapjoin.check.memory.rows=1; +set hive.mapjoin.localtask.max.memory.usage=0.0000000001; + +-- The hash table can only hold a very small amount (based on the parameter hive.mapjoin.localtask.max.memory.usage) +-- so, the attempt to convert to map-join should fail. 
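
All of these tests create tables that are CLUSTERED BY (key) SORTED BY (key), which satisfies the eligibility rule the optimizer enforces before converting: the join columns must cover a prefix of each table's sort columns, in any order among themselves. AbstractSMBJoinProc.checkSortColsAndJoinCols, further down in this patch, implements that rule with subList(...).containsAll(...); the toy version below replays the (a, b, c) example from its comments and is an illustration only, not the patch's code.

    import java.util.Arrays;
    import java.util.List;

    public class SortPrefixRuleDemo {
      // Mirrors the containsAll-over-a-prefix check in checkSortColsAndJoinCols.
      static boolean convertible(List<String> sortCols, List<String> joinCols) {
        return sortCols.size() >= joinCols.size()
            && sortCols.subList(0, joinCols.size()).containsAll(joinCols);
      }

      public static void main(String[] args) {
        List<String> sortCols = Arrays.asList("a", "b", "c");
        System.out.println(convertible(sortCols, Arrays.asList("a")));           // true
        System.out.println(convertible(sortCols, Arrays.asList("b", "a")));      // true
        System.out.println(convertible(sortCols, Arrays.asList("c", "a", "b"))); // true
        System.out.println(convertible(sortCols, Arrays.asList("a", "c")));      // false
      }
    }
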
+ +select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key; Index: ql/src/test/queries/clientpositive/auto_sortmerge_join_6.q =================================================================== --- ql/src/test/queries/clientpositive/auto_sortmerge_join_6.q (revision 0) +++ ql/src/test/queries/clientpositive/auto_sortmerge_join_6.q (working copy) @@ -0,0 +1,31 @@ +-- small no part, 4 bucket & big 2 part, 2 bucket +CREATE TABLE bucket_small (key string, value string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE; +load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small; +load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small; +load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small; +load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small; + +CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE; +load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08'); +load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08'); + +load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09'); +load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09'); + +set hive.auto.sort.merge.join=true; +set hive.optimize.bucketmapjoin = true; +set hive.optimize.bucketmapjoin.sortedmerge = true; + +set hive.auto.sort.merge.join.bigtbl.matcher = org.apache.hadoop.hive.ql.optimizer.SizeSortMergeJoinBigTableMatcher; + +-- Since size is being used to find the big table, the order of the tables in the join does not matter +explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key; +select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key; + +set hive.mapjoin.check.memory.rows=1; +set hive.mapjoin.localtask.max.memory.usage=0.0000000001; + +-- The hash table can only hold a very small amount (based on the parameter hive.mapjoin.localtask.max.memory.usage) +-- so, the attempt to convert to map-join should fail. + +select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key; Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/AvgPartitionSizeSortMergeJoinBigTableMatcher.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/AvgPartitionSizeSortMergeJoinBigTableMatcher.java (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/AvgPartitionSizeSortMergeJoinBigTableMatcher.java (working copy) @@ -0,0 +1,103 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.optimizer; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.exec.JoinOperator; +import org.apache.hadoop.hive.ql.exec.TableScanOperator; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.metadata.Partition; +import org.apache.hadoop.hive.ql.metadata.Table; +import org.apache.hadoop.hive.ql.optimizer.ppr.PartitionPruner; +import org.apache.hadoop.hive.ql.parse.ParseContext; +import org.apache.hadoop.hive.ql.parse.PrunedPartitionList; +import org.apache.hadoop.hive.ql.parse.SemanticException; + +/* + * This is a pluggable policy to choose the candidate map-join table for converting a join to a + * sort merge join. The largest table is chosen based on the size of the tables. + */ +public class AvgPartitionSizeSortMergeJoinBigTableMatcher + extends CommonSizeSortMergeJoinBigTableMatcher + implements SortMergeJoinBigTableMatcher { + + private static final Log LOG = LogFactory + .getLog(AvgPartitionSizeSortMergeJoinBigTableMatcher.class.getName()); + + public int getBigTablePosition(ParseContext parseCtx, JoinOperator joinOp) + throws SemanticException { + int bigTablePos = 0; + long maxSize = 0; + int numPartitionsCurrentBigTable = 0; // number of partitions for the chosen big table + HiveConf conf = parseCtx.getConf(); + + try { + List topOps = new ArrayList(); + getListTopOps(joinOp, topOps); + int currentPos = 0; + for (TableScanOperator topOp : topOps) { + int numPartitions = 1; // in case the sizes match, preference is + // given to the table with fewer partitions + Table table = parseCtx.getTopToTable().get(topOp); + long averageSize = 0; + + if (!table.isPartitioned()) { + averageSize = getSize(conf, table); + } + else { + // For partitioned tables, get the size of all the partitions + PrunedPartitionList partsList = + PartitionPruner.prune(parseCtx.getTopToTable().get(topOp), + parseCtx.getOpToPartPruner().get(topOp), parseCtx.getConf(), + null, parseCtx.getPrunedPartitions()); + numPartitions = partsList.getNotDeniedPartns().size(); + long totalSize = 0; + for (Partition part : partsList.getNotDeniedPartns()) { + totalSize += getSize(conf, part); + } + averageSize = totalSize/numPartitions; + } + + if (averageSize > maxSize) { + maxSize = averageSize; + bigTablePos = currentPos; + numPartitionsCurrentBigTable = numPartitions; + } + // If the sizes match, prefer the table with fewer partitions + else if (averageSize == maxSize) { + if (numPartitions < numPartitionsCurrentBigTable) { + bigTablePos = currentPos; + numPartitionsCurrentBigTable = numPartitions; + } + } + + currentPos++; + } + } catch (HiveException e) { + throw new SemanticException(e.getMessage()); + } + + return bigTablePos; + } +} Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/AbstractSMBJoinProc.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/AbstractSMBJoinProc.java (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/AbstractSMBJoinProc.java (working copy) @@ -0,0 +1,543 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements.See the NOTICE file + * distributed with this work for additional information + * 
regarding copyright ownership.The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License.You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.optimizer; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Stack; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.metastore.api.Order; +import org.apache.hadoop.hive.ql.exec.DummyStoreOperator; +import org.apache.hadoop.hive.ql.exec.FunctionRegistry; +import org.apache.hadoop.hive.ql.exec.JoinOperator; +import org.apache.hadoop.hive.ql.exec.MapJoinOperator; +import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator; +import org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator; +import org.apache.hadoop.hive.ql.exec.TableScanOperator; +import org.apache.hadoop.hive.ql.lib.Node; +import org.apache.hadoop.hive.ql.lib.NodeProcessor; +import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.metadata.Partition; +import org.apache.hadoop.hive.ql.metadata.Table; +import org.apache.hadoop.hive.ql.optimizer.ppr.PartitionPruner; +import org.apache.hadoop.hive.ql.parse.ParseContext; +import org.apache.hadoop.hive.ql.parse.PrunedPartitionList; +import org.apache.hadoop.hive.ql.parse.QB; +import org.apache.hadoop.hive.ql.parse.QBJoinTree; +import org.apache.hadoop.hive.ql.parse.SemanticException; +import org.apache.hadoop.hive.ql.parse.TableAccessAnalyzer; +import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; +import org.apache.hadoop.hive.ql.plan.OperatorDesc; +import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc; +import org.apache.hadoop.hive.ql.plan.SMBJoinDesc; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.apache.hadoop.util.ReflectionUtils; + +//try to replace a bucket map join with a sorted merge map join +abstract public class AbstractSMBJoinProc extends AbstractBucketJoinProc implements NodeProcessor { + + private static final Log LOG = LogFactory + .getLog(SortedMergeBucketMapJoinOptimizer.class.getName()); + + public AbstractSMBJoinProc(ParseContext pctx) { + super(pctx); + } + + public AbstractSMBJoinProc() { + super(); + } + + @Override + abstract public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, + Object... nodeOutputs) throws SemanticException; + + // Return true or false based on whether the mapjoin was converted successfully to + // a sort-merge map join operator. + protected boolean canConvertBucketMapJoinToSMBJoin(MapJoinOperator mapJoinOp, + Stack stack, + SortBucketJoinOptProcCtx smbJoinContext, + Object... 
nodeOutputs) throws SemanticException { + + if (mapJoinOp.getConf().getAliasBucketFileNameMapping() == null + || mapJoinOp.getConf().getAliasBucketFileNameMapping().size() == 0) { + return false; + } + + boolean tableSorted = true; + QBJoinTree joinCxt = this.pGraphContext.getMapJoinContext() + .get(mapJoinOp); + if (joinCxt == null) { + return false; + } + String[] srcs = joinCxt.getBaseSrc(); + for (int srcPos = 0; srcPos < srcs.length; srcPos++) { + srcs[srcPos] = QB.getAppendedAliasFromId(joinCxt.getId(), srcs[srcPos]); + } + + // All the tables/partitions columns should be sorted in the same order + // For example, if tables A and B are being joined on columns c1, c2 and c3 + // which are the sorted and bucketed columns. The join would work, as long + // c1, c2 and c3 are sorted in the same order. + List sortColumnsFirstTable = new ArrayList(); + + for (int pos = 0; pos < srcs.length; pos++) { + tableSorted = tableSorted + && isTableSorted(smbJoinContext, + pGraphContext, + mapJoinOp.getConf().getKeys().get((byte) pos), + joinCxt, + srcs, + pos, + sortColumnsFirstTable); + } + if (!tableSorted) { + // this is a mapjoin but not suited for a sort merge bucket map join. check outer joins + MapJoinProcessor.checkMapJoin(mapJoinOp.getConf().getPosBigTable(), + mapJoinOp.getConf().getConds()); + return false; + } + + smbJoinContext.setSrcs(srcs); + return true; + } + + + // Convert the bucket map-join operator to a sort-merge map join operator + protected SMBMapJoinOperator convertBucketMapJoinToSMBJoin(MapJoinOperator mapJoinOp, + SortBucketJoinOptProcCtx smbJoinContext, + ParseContext parseContext) { + + String[] srcs = smbJoinContext.getSrcs(); + SMBMapJoinOperator smbJop = new SMBMapJoinOperator(mapJoinOp); + SMBJoinDesc smbJoinDesc = new SMBJoinDesc(mapJoinOp.getConf()); + smbJop.setConf(smbJoinDesc); + HashMap tagToAlias = new HashMap(); + for (int i = 0; i < srcs.length; i++) { + tagToAlias.put((byte) i, srcs[i]); + } + smbJoinDesc.setTagToAlias(tagToAlias); + + int indexInListMapJoinNoReducer = + this.pGraphContext.getListMapJoinOpsNoReducer().indexOf(mapJoinOp); + if (indexInListMapJoinNoReducer >= 0 ) { + this.pGraphContext.getListMapJoinOpsNoReducer().remove(indexInListMapJoinNoReducer); + this.pGraphContext.getListMapJoinOpsNoReducer().add(indexInListMapJoinNoReducer, smbJop); + } + + Map aliasToSink = + new HashMap(); + // For all parents (other than the big table), insert a dummy store operator + /* Consider a query like: + * + * select * from + * (subq1 --> has a filter) + * join + * (subq2 --> has a filter) + * on some key + * + * Let us assume that subq1 is the small table (either specified by the user or inferred + * automatically). 
The following operator tree will be created: + * + * TableScan (subq1) --> Select --> Filter --> DummyStore + * \ + * \ SMBJoin + * / + * / + * TableScan (subq2) --> Select --> Filter + */ + + List> parentOperators = mapJoinOp.getParentOperators(); + for (int i = 0; i < parentOperators.size(); i++) { + Operator par = parentOperators.get(i); + int index = par.getChildOperators().indexOf(mapJoinOp); + par.getChildOperators().remove(index); + if (i == smbJoinDesc.getPosBigTable()) { + par.getChildOperators().add(index, smbJop); + } + else { + DummyStoreOperator dummyStoreOp = new DummyStoreOperator(); + par.getChildOperators().add(index, dummyStoreOp); + + List> childrenOps = + new ArrayList>(); + childrenOps.add(smbJop); + dummyStoreOp.setChildOperators(childrenOps); + + List> parentOps = + new ArrayList>(); + parentOps.add(par); + dummyStoreOp.setParentOperators(parentOps); + + aliasToSink.put(srcs[i], dummyStoreOp); + smbJop.getParentOperators().remove(i); + smbJop.getParentOperators().add(i, dummyStoreOp); + } + } + smbJoinDesc.setAliasToSink(aliasToSink); + + List> childOps = mapJoinOp.getChildOperators(); + for (int i = 0; i < childOps.size(); i++) { + Operator child = childOps.get(i); + int index = child.getParentOperators().indexOf(mapJoinOp); + child.getParentOperators().remove(index); + child.getParentOperators().add(index, smbJop); + } + parseContext.getSmbMapJoinContext().put(smbJop, + parseContext.getMapJoinContext().get(mapJoinOp)); + parseContext.getMapJoinContext().remove(mapJoinOp); + parseContext.getOpParseCtx().put(smbJop, parseContext.getOpParseCtx().get(mapJoinOp)); + + return smbJop; + } + + /** + * Whether this table is eligible for a sort-merge join. + * + * @param pctx parse context + * @param op map join operator being considered + * @param joinTree join tree being considered + * @param alias table alias in the join tree being checked + * @param pos position of the table + * @param sortColumnsFirstTable The names and order of the sorted columns for the first table. + * It is not initialized when pos = 0. + * @return + * @throws SemanticException + */ + private boolean isTableSorted( + SortBucketJoinOptProcCtx smbJoinContext, + ParseContext pctx, + List keys, + QBJoinTree joinTree, + String[] aliases, + int pos, + List sortColumnsFirstTable) throws SemanticException { + String alias = aliases[pos]; + HashMap> topOps = this.pGraphContext.getTopOps(); + Map topToTable = this.pGraphContext + .getTopToTable(); + + + /* + * Consider a query like: + * + * select -- mapjoin(subq1) -- * from + * (select a.key, a.value from tbl1 a) subq1 + * join + * (select a.key, a.value from tbl2 a) subq2 + * on subq1.key = subq2.key; + * + * aliasToOpInfo contains the SelectOperator for subq1 and subq2. + * We need to traverse the tree (using TableAccessAnalyzer) to get to the base + * table. If the object being map-joined is a base table, then aliasToOpInfo + * contains the TableScanOperator, and TableAccessAnalyzer is a no-op. 
+ */ + Operator topOp = joinTree.getAliasToOpInfo().get(alias); + if (topOp == null) { + return false; + } + + // get all join columns from join keys + List joinCols = new ArrayList(); + List joinKeys = new ArrayList(); + joinKeys.addAll(keys); + while (joinKeys.size() > 0) { + ExprNodeDesc node = joinKeys.remove(0); + if (node instanceof ExprNodeColumnDesc) { + joinCols.addAll(node.getCols()); + } else if (node instanceof ExprNodeGenericFuncDesc) { + ExprNodeGenericFuncDesc udfNode = ((ExprNodeGenericFuncDesc) node); + GenericUDF udf = udfNode.getGenericUDF(); + if (!FunctionRegistry.isDeterministic(udf)) { + return false; + } + joinKeys.addAll(0, udfNode.getChildExprs()); + } + } + + if (joinCols == null || joinCols.isEmpty()) { + return false; + } + TableScanOperator tso = TableAccessAnalyzer.genRootTableScan(topOp, joinCols); + if (tso == null) { + return false; + } + + // For nested sub-queries, the alias mapping is not maintained in QB currently. + /* + * Consider a query like: + * + * select count(*) from + * ( + * select key, count(*) from + * ( + * select --mapjoin(a)-- a.key as key, a.value as val1, b.value as val2 + * from tbl1 a join tbl2 b on a.key = b.key + * ) subq1 + * group by key + * ) subq2; + * + * The table alias should be subq2:subq1:a which needs to be fetched from topOps. + */ + if (pGraphContext.getTopOps().containsValue(tso)) { + for (Map.Entry> topOpEntry : + this.pGraphContext.getTopOps().entrySet()) { + if (topOpEntry.getValue() == tso) { + alias = topOpEntry.getKey(); + aliases[pos] = alias; + break; + } + } + } + else { + // Ideally, this should never happen, and this should be an assert. + return false; + } + + Table tbl = topToTable.get(tso); + if (tbl.isPartitioned()) { + PrunedPartitionList prunedParts = null; + try { + prunedParts = pGraphContext.getOpToPartList().get(tso); + if (prunedParts == null) { + prunedParts = PartitionPruner.prune(tbl, pGraphContext + .getOpToPartPruner().get(tso), pGraphContext.getConf(), alias, + pGraphContext.getPrunedPartitions()); + pGraphContext.getOpToPartList().put(tso, prunedParts); + } + } catch (HiveException e) { + LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e)); + throw new SemanticException(e.getMessage(), e); + } + List partitions = prunedParts.getNotDeniedPartns(); + // Populate the names and order of columns for the first partition of the + // first table + if ((pos == 0) && (partitions != null) && (!partitions.isEmpty())) { + Partition firstPartition = partitions.get(0); + sortColumnsFirstTable.addAll(firstPartition.getSortCols()); + } + + for (Partition partition : prunedParts.getNotDeniedPartns()) { + if (!checkSortColsAndJoinCols(partition.getSortCols(), + joinCols, + sortColumnsFirstTable)) { + return false; + } + } + return true; + } + + // Populate the names and order of columns for the first table + if (pos == 0) { + sortColumnsFirstTable.addAll(tbl.getSortCols()); + } + + return checkSortColsAndJoinCols(tbl.getSortCols(), + joinCols, + sortColumnsFirstTable); + } + + private boolean checkSortColsAndJoinCols(List sortCols, + List joinCols, + List sortColumnsFirstPartition) { + + if (sortCols == null || sortCols.size() < joinCols.size()) { + return false; + } + + // A join is eligible for a sort-merge join, only if it is eligible for + // a bucketized map join. So, we dont need to check for bucketized map + // join here. We are guaranteed that the join keys contain all the + // bucketized keys (note that the order need not be the same). 
+ List sortColNames = new ArrayList(); + + // The join columns should contain all the sort columns + // The sort columns of all the tables should be in the same order + // compare the column names and the order with the first table/partition. + for (int pos = 0; pos < sortCols.size(); pos++) { + Order o = sortCols.get(pos); + + if (o.getOrder() != sortColumnsFirstPartition.get(pos).getOrder()) { + return false; + } + sortColNames.add(o.getCol()); + } + + // The column names and order (ascending/descending) matched + // The first 'n' sorted columns should be the same as the joinCols, where + // 'n' is the size of join columns. + // For eg: if the table is sorted by (a,b,c), it is OK to convert if the join is + // on (a), (a,b), or any combination of (a,b,c): + // (a,b,c), (a,c,b), (c,a,b), (c,b,a), (b,c,a), (b,a,c) + // but it is not OK to convert if the join is on (a,c) + return sortColNames.subList(0, joinCols.size()).containsAll(joinCols); + } + + // Can the join operator be converted to a sort-merge join operator ? + // It is already verified that the join can be converted to a bucket map join + protected boolean checkConvertJoinToSMBJoin( + JoinOperator joinOperator, + SortBucketJoinOptProcCtx smbJoinContext, + ParseContext pGraphContext) throws SemanticException { + + boolean tableSorted = true; + QBJoinTree joinCtx = pGraphContext.getJoinContext().get(joinOperator); + + if (joinCtx == null) { + return false; + } + String[] srcs = joinCtx.getBaseSrc(); + + // All the tables/partitions columns should be sorted in the same order + // For example, if tables A and B are being joined on columns c1, c2 and c3 + // which are the sorted and bucketed columns. The join would work, as long + // c1, c2 and c3 are sorted in the same order. + List sortColumnsFirstTable = new ArrayList(); + + for (int pos = 0; pos < srcs.length; pos++) { + tableSorted = tableSorted && + isTableSorted(smbJoinContext, + pGraphContext, + smbJoinContext.getKeyExprMap().get((byte)pos), + joinCtx, + srcs, + pos, + sortColumnsFirstTable); + } + + smbJoinContext.setSrcs(srcs); + return true; + } + + // Can the join operator be converted to a sort-merge join operator ? + protected boolean canConvertJoinToSMBJoin( + JoinOperator joinOperator, + SortBucketJoinOptProcCtx smbJoinContext, + ParseContext pGraphContext) throws SemanticException { + boolean canConvert = + canConvertJoinToBucketMapJoin( + joinOperator, + pGraphContext, + smbJoinContext + ); + + if (!canConvert) { + return false; + } + + return checkConvertJoinToSMBJoin(joinOperator, smbJoinContext, pGraphContext); + } + + // Can the join operator be converted to a bucket map-merge join operator ? 
+ protected boolean canConvertJoinToBucketMapJoin( + JoinOperator joinOp, + ParseContext pGraphContext, + SortBucketJoinOptProcCtx context) throws SemanticException { + + // This has already been inspected and rejected + if (context.getSetOfRejectedJoins().contains(joinOp)) { + return false; + } + + QBJoinTree joinCtx = pGraphContext.getJoinContext().get(joinOp); + if (joinCtx == null) { + return false; + } + + Class bigTableMatcherClass = null; + try { + bigTableMatcherClass = + (Class) + (Class.forName(HiveConf.getVar(pGraphContext.getConf(), + HiveConf.ConfVars.HIVE_AUTO_SORT_MERGE_JOIN_BIGTABLE_MATCHER))); + } catch (ClassNotFoundException e) { + throw new RuntimeException(e.getMessage()); + } + + SortMergeJoinBigTableMatcher bigTableMatcher = + (SortMergeJoinBigTableMatcher) ReflectionUtils.newInstance(bigTableMatcherClass, null); + int bigTablePosition = + bigTableMatcher.getBigTablePosition(pGraphContext, joinOp); + context.setBigTablePosition(bigTablePosition); + String joinAlias = + bigTablePosition == 0 ? + joinCtx.getLeftAlias() : joinCtx.getRightAliases()[bigTablePosition - 1]; + + Map> keyExprMap = new HashMap>(); + List> parentOps = joinOp.getParentOperators(); + // get the join keys from parent ReduceSink operators + for (Operator parentOp : parentOps) { + ReduceSinkDesc rsconf = ((ReduceSinkOperator)parentOp).getConf(); + Byte tag = (byte) rsconf.getTag(); + List keys = rsconf.getKeyCols(); + keyExprMap.put(tag, keys); + } + + context.setKeyExprMap(keyExprMap); + + // Given a candidate map-join, can this join be converted. + // The candidate map-join was derived from the pluggable sort merge join big + // table matcher. + return checkConvertBucketMapJoin( + pGraphContext, + context, + joinCtx, + keyExprMap, + joinAlias, + Arrays.asList(joinCtx.getBaseSrc())); + } + + // Convert the join operator to a bucket map-join join operator + protected MapJoinOperator convertJoinToBucketMapJoin( + JoinOperator joinOp, + SortBucketJoinOptProcCtx joinContext, + ParseContext parseContext) throws SemanticException { + MapJoinOperator mapJoinOp = MapJoinProcessor.convertMapJoin( + parseContext.getOpParseCtx(), + joinOp, + pGraphContext.getJoinContext().get(joinOp), + joinContext.getBigTablePosition(), + false); + // Remove the join operator from the query join context + parseContext.getMapJoinContext().put(mapJoinOp, parseContext.getJoinContext().get(joinOp)); + parseContext.getJoinContext().remove(joinOp); + convertMapJoinToBucketMapJoin(mapJoinOp, joinContext); + return mapJoinOp; + } + + // Convert the join operator to a sort-merge join operator + protected void convertJoinToSMBJoin( + JoinOperator joinOp, + SortBucketJoinOptProcCtx smbJoinContext, + ParseContext parseContext) throws SemanticException { + MapJoinOperator mapJoinOp = convertJoinToBucketMapJoin(joinOp, smbJoinContext, parseContext); + SMBMapJoinOperator smbMapJoinOp = + convertBucketMapJoinToSMBJoin(mapJoinOp, smbJoinContext, parseContext); + smbMapJoinOp.setConvertedAutomaticallySMBJoin(true); + } +} Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortBucketJoinOptProcCtx.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortBucketJoinOptProcCtx.java (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortBucketJoinOptProcCtx.java (working copy) @@ -0,0 +1,59 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements.See the NOTICE file + * distributed with this work for 
additional information + * regarding copyright ownership.The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License.You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.optimizer; + +import java.util.List; +import java.util.Map; + +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; + + +public class SortBucketJoinOptProcCtx extends BucketJoinOptProcCtx { + private String[] srcs; + private int bigTablePosition; + private Map> keyExprMap; + + public SortBucketJoinOptProcCtx(HiveConf conf) { + super(conf); + } + + public String[] getSrcs() { + return srcs; + } + + public void setSrcs(String[] srcs) { + this.srcs = srcs; + } + + public int getBigTablePosition() { + return bigTablePosition; + } + + public void setBigTablePosition(int bigTablePosition) { + this.bigTablePosition = bigTablePosition; + } + + public Map> getKeyExprMap() { + return keyExprMap; + } + + public void setKeyExprMap(Map> keyExprMap) { + this.keyExprMap = keyExprMap; + } +} Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java (revision 1431968) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java (working copy) @@ -43,6 +43,7 @@ import org.apache.hadoop.hive.ql.exec.OperatorFactory; import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator; import org.apache.hadoop.hive.ql.exec.RowSchema; +import org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator; import org.apache.hadoop.hive.ql.exec.ScriptOperator; import org.apache.hadoop.hive.ql.exec.SelectOperator; import org.apache.hadoop.hive.ql.exec.UnionOperator; @@ -73,6 +74,7 @@ import org.apache.hadoop.hive.ql.plan.PartitionDesc; import org.apache.hadoop.hive.ql.plan.PlanUtils; import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc; +import org.apache.hadoop.hive.ql.plan.SMBJoinDesc; import org.apache.hadoop.hive.ql.plan.SelectDesc; import org.apache.hadoop.hive.ql.plan.TableDesc; import org.apache.hadoop.hive.serde.serdeConstants; @@ -119,7 +121,7 @@ ArrayList smallTableAliasList = new ArrayList(); String bigTableAlias = null; - // create a new MapredLocalWork + // create a new MapredLocalWork MapredLocalWork newLocalWork = new MapredLocalWork( new LinkedHashMap>(), new LinkedHashMap()); @@ -202,7 +204,7 @@ // set alias to fetch work newLocalWork.getAliasToFetchWork().put(alias, fetchWork); } - // remove small table ailias from aliasToWork;Avoid concurrent modification + // remove small table alias from aliasToWork;Avoid concurrent modification for (String alias : smallTableAliasList) { newWork.getAliasToWork().remove(alias); } @@ -218,31 +220,25 @@ return bigTableAlias; } - public static String genMapJoinOpAndLocalWork(MapredWork newWork, JoinOperator op, int mapJoinPos) - throws SemanticException { - try { - LinkedHashMap, OpParseContext> opParseCtxMap = - newWork.getOpParseCtxMap(); - QBJoinTree newJoinTree = newWork.getJoinTree(); - // generate 
the map join operator; already checked the map join - MapJoinOperator newMapJoinOp = MapJoinProcessor.convertMapJoin(opParseCtxMap, op, - newJoinTree, mapJoinPos, true); - // generate the local work and return the big table alias - String bigTableAlias = MapJoinProcessor - .genMapJoinLocalWork(newWork, newMapJoinOp, mapJoinPos); - // clean up the mapred work - newWork.setOpParseCtxMap(null); - newWork.setJoinTree(null); + public static String genMapJoinOpAndLocalWork(MapredWork newWork, + MapJoinOperator newMapJoinOp, int mapJoinPos) + throws SemanticException { + try { + // generate the local work and return the big table alias + String bigTableAlias = MapJoinProcessor + .genMapJoinLocalWork(newWork, newMapJoinOp, mapJoinPos); + // clean up the mapred work + newWork.setOpParseCtxMap(null); + newWork.setJoinTree(null); - return bigTableAlias; + return bigTableAlias; - } catch (Exception e) { - e.printStackTrace(); - throw new SemanticException("Generate New MapJoin Opertor Exeception " + e.getMessage()); + } catch (Exception e) { + e.printStackTrace(); + throw new SemanticException("Generate New MapJoin Opertor Exeception " + e.getMessage()); + } } - } - /** * convert a regular join to a a map-side join. * @@ -257,7 +253,7 @@ * @param noCheckOuterJoin */ public static MapJoinOperator convertMapJoin( - LinkedHashMap, OpParseContext> opParseCtxMap, + Map, OpParseContext> opParseCtxMap, JoinOperator op, QBJoinTree joinTree, int mapJoinPos, boolean noCheckOuterJoin) throws SemanticException { // outer join cannot be performed on a table which is being cached @@ -480,6 +476,66 @@ return mapJoinOp; } + + /** + * convert a sortmerge join to a a map-side join. + * + * @param opParseCtxMap + * @param op + * join operator + * @param joinTree + * qb join tree + * @param mapJoinPos + * position of the source to be read as part of map-reduce framework. 
All other sources + * are cached in memory + * @param noCheckOuterJoin + */ + public static MapJoinOperator convertMapJoin( + Map, OpParseContext> opParseCtxMap, + SMBMapJoinOperator op, QBJoinTree joinTree, int mapJoinPos, boolean noCheckOuterJoin) + throws SemanticException { + // Create a new map join operator + SMBJoinDesc smbJoinDesc = op.getConf(); + List keyCols = smbJoinDesc.getKeys().get(Byte.valueOf((byte) 0)); + TableDesc keyTableDesc = PlanUtils.getMapJoinKeyTableDesc(PlanUtils + .getFieldSchemasFromColumnList(keyCols, "mapjoinkey")); + MapJoinDesc mapJoinDesc = new MapJoinDesc(smbJoinDesc.getKeys(), + keyTableDesc, smbJoinDesc.getExprs(), + smbJoinDesc.getValueTblDescs(), smbJoinDesc.getValueTblDescs(), + smbJoinDesc.getOutputColumnNames(), + mapJoinPos, smbJoinDesc.getConds(), + smbJoinDesc.getFilters(), smbJoinDesc.isNoOuterJoin(), smbJoinDesc.getDumpFilePrefix()); + + RowResolver joinRS = opParseCtxMap.get(op).getRowResolver(); + MapJoinOperator mapJoinOp = (MapJoinOperator) OperatorFactory.getAndMakeChild( + mapJoinDesc, joinRS.getRowSchema(), + new ArrayList>()); + + OpParseContext ctx = new OpParseContext(joinRS); + opParseCtxMap.put(mapJoinOp, ctx); + + + // change the children of the original join operator to point to the map + // join operator + List> childOps = op.getChildOperators(); + for (Operator childOp : childOps) { + childOp.replaceParent(op, mapJoinOp); + } + mapJoinOp.setChildOperators(childOps); + op.setChildOperators(null); + + // change the parent of the original join operator to point to the map + // join operator + List> parentOps = op.getParentOperators(); + for (Operator parentOp : parentOps) { + parentOp.replaceChild(op, mapJoinOp); + } + mapJoinOp.setParentOperators(parentOps); + op.setParentOperators(null); + + return mapJoinOp; + } + public MapJoinOperator generateMapJoinOperator(ParseContext pctx, JoinOperator op, QBJoinTree joinTree, int mapJoinPos) throws SemanticException { HiveConf hiveConf = pctx.getConf(); Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/SortMergeJoinResolver.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/SortMergeJoinResolver.java (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/SortMergeJoinResolver.java (working copy) @@ -0,0 +1,44 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hive.ql.optimizer.physical; + +import java.util.ArrayList; + +import org.apache.hadoop.hive.ql.lib.Dispatcher; +import org.apache.hadoop.hive.ql.lib.Node; +import org.apache.hadoop.hive.ql.lib.TaskGraphWalker; +import org.apache.hadoop.hive.ql.parse.SemanticException; + + +public class SortMergeJoinResolver implements PhysicalPlanResolver { + @Override + public PhysicalContext resolve(PhysicalContext pctx) throws SemanticException { + + // create dispatcher and graph walker + Dispatcher disp = new SortMergeJoinTaskDispatcher(pctx); + TaskGraphWalker ogw = new TaskGraphWalker(disp); + + // get all the tasks nodes from root task + ArrayList topNodes = new ArrayList(); + topNodes.addAll(pctx.rootTasks); + + // begin to walk through the task tree. + ogw.startWalking(topNodes, null); + return pctx; + } +} Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/CommonJoinTaskDispatcher.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/CommonJoinTaskDispatcher.java (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/CommonJoinTaskDispatcher.java (working copy) @@ -0,0 +1,88 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.optimizer.physical; + +import org.apache.hadoop.hive.ql.Context; +import org.apache.hadoop.hive.ql.exec.ConditionalTask; +import org.apache.hadoop.hive.ql.exec.JoinOperator; +import org.apache.hadoop.hive.ql.exec.MapJoinOperator; +import org.apache.hadoop.hive.ql.exec.MapRedTask; +import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.exec.Task; +import org.apache.hadoop.hive.ql.lib.Dispatcher; +import org.apache.hadoop.hive.ql.optimizer.MapJoinProcessor; +import org.apache.hadoop.hive.ql.parse.ParseContext; +import org.apache.hadoop.hive.ql.parse.QBJoinTree; +import org.apache.hadoop.hive.ql.parse.SemanticException; +import org.apache.hadoop.hive.ql.plan.MapredWork; +import org.apache.hadoop.hive.ql.plan.OperatorDesc; + +/** + * Iterator each tasks. If this task has a local work,create a new task for this local work, named + * MapredLocalTask. 
then make this new generated task depends on current task's parent task, and + * make current task depends on this new generated task + */ +public class CommonJoinTaskDispatcher extends JoinTaskDispatcher implements Dispatcher { + + public CommonJoinTaskDispatcher(PhysicalContext context) { + super(context); + } + + @Override + public ConditionalTask processCurrentTask(MapRedTask currTask, + ConditionalTask conditionalTask, Context context) + throws SemanticException { + + // whether it contains common join op; if contains, return this common join op + JoinOperator joinOp = getJoinOp(currTask); + if (joinOp == null) { + return null; + } + currTask.setTaskTag(Task.COMMON_JOIN); + + // get parseCtx for this Join Operator + ParseContext parseCtx = physicalContext.getParseContext(); + QBJoinTree joinTree = parseCtx.getJoinContext().get(joinOp); + + return processCurrentTask(currTask, currTask.getWork(), + conditionalTask, context, + joinTree, joinOp.getConf()); + } + + @Override + public MapJoinOperator getMapJoinOperator(MapRedTask task, + MapredWork work, QBJoinTree joinTree, int mapJoinPos) throws SemanticException { + JoinOperator newJoinOp = getJoinOp(task); + // generate the map join operator; already checked the map join + return MapJoinProcessor.convertMapJoin(work.getOpParseCtxMap(), newJoinOp, + joinTree, mapJoinPos, true); + } + + + private JoinOperator getJoinOp(MapRedTask task) throws SemanticException { + if (task.getWork() == null) { + return null; + } + Operator reducerOp = task.getWork().getReducer(); + if (reducerOp instanceof JoinOperator) { + return (JoinOperator) reducerOp; + } else { + return null; + } + } +} Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/PhysicalOptimizer.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/PhysicalOptimizer.java (revision 1431968) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/PhysicalOptimizer.java (working copy) @@ -52,6 +52,10 @@ if (hiveConf.getBoolVar(HiveConf.ConfVars.HIVECONVERTJOIN)) { resolvers.add(new CommonJoinResolver()); } + if (hiveConf.getBoolVar(HiveConf.ConfVars.HIVE_AUTO_SORT_MERGE_JOIN)) { + resolvers.add(new SortMergeJoinResolver()); + } + if (hiveConf.getBoolVar(HiveConf.ConfVars.HIVEOPTINDEXFILTER)) { resolvers.add(new IndexWhereResolver()); } Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/SortMergeJoinTaskDispatcher.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/SortMergeJoinTaskDispatcher.java (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/SortMergeJoinTaskDispatcher.java (working copy) @@ -0,0 +1,229 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.optimizer.physical; + +import java.io.ByteArrayInputStream; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +import org.apache.hadoop.hive.ql.Context; +import org.apache.hadoop.hive.ql.exec.ConditionalTask; +import org.apache.hadoop.hive.ql.exec.MapJoinOperator; +import org.apache.hadoop.hive.ql.exec.MapRedTask; +import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator; +import org.apache.hadoop.hive.ql.exec.Task; +import org.apache.hadoop.hive.ql.exec.Utilities; +import org.apache.hadoop.hive.ql.lib.Dispatcher; +import org.apache.hadoop.hive.ql.optimizer.MapJoinProcessor; +import org.apache.hadoop.hive.ql.parse.OpParseContext; +import org.apache.hadoop.hive.ql.parse.ParseContext; +import org.apache.hadoop.hive.ql.parse.QBJoinTree; +import org.apache.hadoop.hive.ql.parse.SemanticException; +import org.apache.hadoop.hive.ql.plan.FetchWork; +import org.apache.hadoop.hive.ql.plan.MapredLocalWork; +import org.apache.hadoop.hive.ql.plan.MapredWork; +import org.apache.hadoop.hive.ql.plan.OperatorDesc; +import org.apache.hadoop.hive.ql.plan.PartitionDesc; + +/** + * Iterator over each task. If the task has a smb join, convert the task to a conditional task. + * The conditional task will first try all mapjoin possibilities, and go the the smb join if the + * mapjoin fails. The smb join will be a backup task for all the mapjoin tasks. + */ +public class SortMergeJoinTaskDispatcher extends JoinTaskDispatcher implements Dispatcher { + + public SortMergeJoinTaskDispatcher(PhysicalContext context) { + super(context); + } + + // Convert the work in the SMB plan to a regular join + // Note that the operator tree is not fixed, only the path/alias mappings in the + // plan are fixed + private void genSMBJoinWork(MapredWork currWork, SMBMapJoinOperator joinOp) { + // Remove the paths which are not part of aliasToPartitionInfo + Map aliasToPartitionInfo = currWork.getAliasToPartnInfo(); + List removePaths = new ArrayList(); + + for (Map.Entry> entry : currWork.getPathToAliases().entrySet()) { + boolean keepPath = false; + for (String alias : entry.getValue()) { + if (aliasToPartitionInfo.containsKey(alias)) { + keepPath = true; + break; + } + } + + // Remove if the path is not present + if (!keepPath) { + removePaths.add(entry.getKey()); + } + } + + List removeAliases = new ArrayList(); + for (String removePath : removePaths) { + removeAliases.addAll(currWork.getPathToAliases().get(removePath)); + currWork.getPathToAliases().remove(removePath); + currWork.getPathToPartitionInfo().remove(removePath); + } + + for (String alias : removeAliases) { + currWork.getAliasToPartnInfo().remove(alias); + currWork.getAliasToWork().remove(alias); + } + + // Get the MapredLocalWork + MapredLocalWork localWork = joinOp.getConf().getLocalWork(); + + for (Map.Entry> entry : localWork.getAliasToWork() + .entrySet()) { + String alias = entry.getKey(); + Operator op = entry.getValue(); + FetchWork fetchWork = localWork.getAliasToFetchWork().get(alias); + + // Add the entry in mapredwork + currWork.getAliasToWork().put(alias, op); + + PartitionDesc partitionInfo = currWork.getAliasToPartnInfo().get(alias); + if (fetchWork.getTblDir() != null) { + ArrayList aliases = new ArrayList(); + aliases.add(alias); + currWork.getPathToAliases().put(fetchWork.getTblDir(), aliases); + 
currWork.getPathToPartitionInfo().put(fetchWork.getTblDir(), partitionInfo); + } + else { + for (String pathDir : fetchWork.getPartDir()) { + ArrayList aliases = new ArrayList(); + aliases.add(alias); + currWork.getPathToAliases().put(pathDir, aliases); + currWork.getPathToPartitionInfo().put(pathDir, partitionInfo); + } + } + } + } + + private MapredWork convertSMBToJoin(MapredWork currWork, SMBMapJoinOperator oldJoinOp) + throws SemanticException { + try { + String xml = currWork.toXML(); + + // create map join task and set big table as i + // deep copy a new mapred work from xml + InputStream in = new ByteArrayInputStream(xml.getBytes("UTF-8")); + MapredWork newWork = Utilities.deserializeMapRedWork(in, physicalContext.getConf()); + SMBMapJoinOperator newJoinOp = getSMBMapJoinOp(newWork); + + // Add the row resolver for the new operator + Map, OpParseContext> opParseContextMap = + physicalContext.getParseContext().getOpParseCtx(); + opParseContextMap.put(newJoinOp, opParseContextMap.get(oldJoinOp)); + // create a mapred task for this work + genSMBJoinWork(newWork, newJoinOp); + return newWork; + } catch (Exception e) { + e.printStackTrace(); + throw new SemanticException("Generate Map Join Task Error: " + e.getMessage()); + } + } + + @Override + public ConditionalTask processCurrentTask(MapRedTask currTask, + ConditionalTask conditionalTask, Context context) + throws SemanticException { + + // whether it contains a sort merge join op + SMBMapJoinOperator joinOp = getSMBMapJoinOp(currTask); + if (joinOp == null) { + return null; + } + if (!joinOp.isConvertedAutomaticallySMBJoin()) { + return null; + } + currTask.setTaskTag(Task.CONVERTED_SORTMERGEJOIN); + + MapredWork currWork = currTask.getWork(); + + // Convert the sort-merge join into a regular join. + // Then, convert all the join into a series of map-joins. 
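convertSMBToJoin above never mutates the original plan: it round-trips the MapredWork through its XML form and rewrites the copy, so the untouched sort-merge plan can stay behind as the backup task. The deep-copy idiom, isolated into a minimal sketch under the assumption that the Hive classes from this patch's era are on the classpath (the helper class and method names are mine, not the patch's):

import java.io.ByteArrayInputStream;
import java.io.InputStream;

import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.plan.MapredWork;

public class DeepCopyWorkSketch {
  // Clone a MapredWork the same way the dispatcher does: serialize the plan
  // to XML and deserialize it back, so the copy can be modified independently
  // of the plan that backs the fallback task.
  static MapredWork deepCopy(MapredWork work, HiveConf conf) throws Exception {
    String xml = work.toXML();
    InputStream in = new ByteArrayInputStream(xml.getBytes("UTF-8"));
    return Utilities.deserializeMapRedWork(in, conf);
  }
}

This is a compile-only sketch; in the dispatcher the clone is immediately followed by fixing the path/alias mappings for the local work, as shown in genSMBJoinWork above.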
+ MapredWork currJoinWork = convertSMBToJoin(currWork, joinOp); + + // get parseCtx for this Join Operator + ParseContext parseCtx = physicalContext.getParseContext(); + QBJoinTree joinTree = parseCtx.getSmbMapJoinContext().get(joinOp); + + return processCurrentTask(currTask, currJoinWork, + conditionalTask, context, joinTree, joinOp.getConf()); + } + + private SMBMapJoinOperator getSMBMapJoinOp(Operator currOp) { + if (currOp instanceof SMBMapJoinOperator) { + return (SMBMapJoinOperator) currOp; + } + + List> childOps = currOp.getChildOperators(); + if ((childOps == null) || (childOps.isEmpty())) { + return null; + } + + for (Operator childOp : childOps) { + SMBMapJoinOperator ret = getSMBMapJoinOp(childOp); + if (ret != null) { + return ret; + } + } + + return null; + } + + private SMBMapJoinOperator getSMBMapJoinOp(MapredWork work) throws SemanticException { + for (Operator op : work.getAliasToWork().values()) { + SMBMapJoinOperator smbMapJoinOp = getSMBMapJoinOp(op); + if (smbMapJoinOp != null) { + return smbMapJoinOp; + } + } + + return null; + } + + private SMBMapJoinOperator getSMBMapJoinOp(MapRedTask task) throws SemanticException { + if (task.getWork() == null) { + return null; + } + return getSMBMapJoinOp(task.getWork()); + } + + + @Override + public MapJoinOperator getMapJoinOperator(MapRedTask task, + MapredWork work, QBJoinTree joinTree, int mapJoinPos) throws SemanticException { + SMBMapJoinOperator joinOp = getSMBMapJoinOp(task); + + // Add the row resolver for the new operator + Map,OpParseContext> opParseContextMap = + physicalContext.getParseContext().getOpParseCtx(); + opParseContextMap.put(joinOp, opParseContextMap.get(joinOp)); + + // generate the map join operator; already checked the map join + return MapJoinProcessor.convertMapJoin(work.getOpParseCtxMap(), joinOp, + joinTree, mapJoinPos, true); + + } +} Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/JoinTaskDispatcher.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/JoinTaskDispatcher.java (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/JoinTaskDispatcher.java (working copy) @@ -0,0 +1,276 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hive.ql.optimizer.physical; + +import java.io.ByteArrayInputStream; +import java.io.InputStream; +import java.io.Serializable; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Stack; + +import org.apache.hadoop.fs.ContentSummary; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.Context; +import org.apache.hadoop.hive.ql.exec.ConditionalTask; +import org.apache.hadoop.hive.ql.exec.MapJoinOperator; +import org.apache.hadoop.hive.ql.exec.MapRedTask; +import org.apache.hadoop.hive.ql.exec.Task; +import org.apache.hadoop.hive.ql.exec.TaskFactory; +import org.apache.hadoop.hive.ql.exec.Utilities; +import org.apache.hadoop.hive.ql.lib.Dispatcher; +import org.apache.hadoop.hive.ql.lib.Node; +import org.apache.hadoop.hive.ql.lib.TaskGraphWalker.TaskGraphWalkerContext; +import org.apache.hadoop.hive.ql.optimizer.MapJoinProcessor; +import org.apache.hadoop.hive.ql.parse.ParseContext; +import org.apache.hadoop.hive.ql.parse.QBJoinTree; +import org.apache.hadoop.hive.ql.parse.SemanticException; +import org.apache.hadoop.hive.ql.plan.ConditionalResolverCommonJoin; +import + org.apache.hadoop.hive.ql.plan.ConditionalResolverCommonJoin.ConditionalResolverCommonJoinCtx; +import org.apache.hadoop.hive.ql.plan.ConditionalWork; +import org.apache.hadoop.hive.ql.plan.JoinDesc; +import org.apache.hadoop.hive.ql.plan.MapredWork; + +/** + * Iterator over each tasks. This is a common class, where the task satisfying a certain condition + * (which is pluggable) is converted. An example is where a sort-merge join is converted to a + * conditional task, with mapjoin tasks and treating the smb join as a backup task in case the + * mapjoin fails. + */ +public abstract class JoinTaskDispatcher implements Dispatcher { + + protected final PhysicalContext physicalContext; + + public JoinTaskDispatcher(PhysicalContext context) { + super(); + physicalContext = context; + } + + protected ConditionalTask processCurrentTask(MapRedTask currTask, + MapredWork currWork, + ConditionalTask conditionalTask, Context context, + QBJoinTree joinTree, JoinDesc joinDesc) + throws SemanticException { + + // create conditional work list and task list + List listWorks = new ArrayList(); + List> listTasks = new ArrayList>(); + + // create alias to task mapping and alias to input file mapping for resolver + HashMap> aliasToTask = + new HashMap>(); + HashMap> pathToAliases = currWork.getPathToAliases(); + + // get parseCtx for this Join Operator + ParseContext parseCtx = physicalContext.getParseContext(); + + // start to generate multiple map join tasks + Byte[] order = joinDesc.getTagOrder(); + int numAliases = order.length; + + long aliasTotalKnownInputSize = 0; + HashMap aliasToSize = new HashMap(); + try { + // go over all the input paths, and calculate a known total size, known + // size for each input alias. + Utilities.getInputSummary(context, currWork, null).getLength(); + + // set alias to size mapping, this can be used to determine if one table + // is chosen as big table, what's the total size of left tables, which + // are going to be small tables. 
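The sizing decision described in the comment above works per candidate: if alias X is streamed as the big table, every other alias must fit in memory, so the sum of the remaining known input sizes is compared against the threshold read from HiveConf.ConfVars.HIVESMALLTABLESFILESIZE. A standalone sketch of just that arithmetic (plain JDK types, illustrative names, no Hive dependencies):

import java.util.HashMap;
import java.util.Map;

public class SmallTableThresholdSketch {
  // Decide whether 'bigAlias' may be the streamed (big) table: the remaining
  // aliases become in-memory small tables, so their combined known size must
  // not exceed the configured threshold. An unknown or zero size keeps the
  // candidate, matching the null/<=0 check in the dispatcher.
  static boolean canBeBigTable(Map<String, Long> aliasToSize,
                               String bigAlias, long threshold) {
    long total = 0;
    for (long size : aliasToSize.values()) {
      total += size;
    }
    Long bigSize = aliasToSize.get(bigAlias);
    if (bigSize == null || bigSize <= 0) {
      return true;
    }
    long smallTablesTotal = total - bigSize;
    return smallTablesTotal <= threshold;
  }

  public static void main(String[] args) {
    Map<String, Long> sizes = new HashMap<String, Long>();
    sizes.put("a", 10L * 1024 * 1024);    // 10 MB
    sizes.put("b", 200L * 1024 * 1024);   // 200 MB
    long threshold = 25L * 1024 * 1024;   // 25 MB of small tables allowed
    System.out.println(canBeBigTable(sizes, "b", threshold)); // true: only 10 MB stays in memory
    System.out.println(canBeBigTable(sizes, "a", threshold)); // false: 200 MB would be cached
  }
}

Candidates that fail this check are simply skipped; the ones that pass each get their own map-join task inside the conditional task, with the original plan kept as the backup.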
+ for (Map.Entry> entry : pathToAliases.entrySet()) { + String path = entry.getKey(); + List aliasList = entry.getValue(); + ContentSummary cs = context.getCS(path); + if (cs != null) { + long size = cs.getLength(); + for (String alias : aliasList) { + aliasTotalKnownInputSize += size; + Long es = aliasToSize.get(alias); + if (es == null) { + es = new Long(0); + } + es += size; + aliasToSize.put(alias, es); + } + } + } + + HashSet bigTableCandidates = MapJoinProcessor.getBigTableCandidates(joinDesc + .getConds()); + + // no table could be the big table; there is no need to convert + if (bigTableCandidates == null) { + return null; + } + currWork.setOpParseCtxMap(parseCtx.getOpParseCtx()); + currWork.setJoinTree(joinTree); + + String xml = currWork.toXML(); + String bigTableAlias = null; + + long ThresholdOfSmallTblSizeSum = HiveConf.getLongVar(context.getConf(), + HiveConf.ConfVars.HIVESMALLTABLESFILESIZE); + for (int i = 0; i < numAliases; i++) { + // this table cannot be big table + if (!bigTableCandidates.contains(i)) { + continue; + } + + // create map join task and set big table as i + // deep copy a new mapred work from xml + InputStream in = new ByteArrayInputStream(xml.getBytes("UTF-8")); + MapredWork newWork = Utilities.deserializeMapRedWork(in, physicalContext.getConf()); + // create a mapred task for this work + MapRedTask newTask = (MapRedTask) TaskFactory.get(newWork, physicalContext + .getParseContext().getConf()); + + // generate the map join operator; already checked the map join + MapJoinOperator newMapJoinOp = getMapJoinOperator(newTask, newWork, joinTree, i); + bigTableAlias = MapJoinProcessor.genMapJoinOpAndLocalWork(newWork, newMapJoinOp, i); + + Long aliasKnownSize = aliasToSize.get(bigTableAlias); + if (aliasKnownSize != null && aliasKnownSize.longValue() > 0) { + long smallTblTotalKnownSize = aliasTotalKnownInputSize + - aliasKnownSize.longValue(); + if (smallTblTotalKnownSize > ThresholdOfSmallTblSizeSum) { + // this table is not good to be a big table. 
+ continue; + } + } + + // add into conditional task + listWorks.add(newWork); + listTasks.add(newTask); + newTask.setTaskTag(Task.CONVERTED_MAPJOIN); + + // set up backup task + newTask.setBackupTask(currTask); + newTask.setBackupChildrenTasks(currTask.getChildTasks()); + + // put the mapping alias to task + aliasToTask.put(bigTableAlias, newTask); + } + } catch (Exception e) { + e.printStackTrace(); + throw new SemanticException("Generate Map Join Task Error: " + e.getMessage()); + } + + // insert current common join task to conditional task + listWorks.add(currTask.getWork()); + listTasks.add(currTask); + // clear JoinTree and OP Parse Context + currWork.setOpParseCtxMap(null); + currWork.setJoinTree(null); + + // create conditional task and insert conditional task into task tree + ConditionalWork cndWork = new ConditionalWork(listWorks); + ConditionalTask cndTsk = (ConditionalTask) TaskFactory.get(cndWork, parseCtx.getConf()); + cndTsk.setListTasks(listTasks); + + // set resolver and resolver context + cndTsk.setResolver(new ConditionalResolverCommonJoin()); + ConditionalResolverCommonJoinCtx resolverCtx = new ConditionalResolverCommonJoinCtx(); + resolverCtx.setPathToAliases(pathToAliases); + resolverCtx.setAliasToKnownSize(aliasToSize); + resolverCtx.setAliasToTask(aliasToTask); + resolverCtx.setCommonJoinTask(currTask); + resolverCtx.setLocalTmpDir(context.getLocalScratchDir(false)); + resolverCtx.setHdfsTmpDir(context.getMRScratchDir()); + cndTsk.setResolverCtx(resolverCtx); + + // replace the current task with the new generated conditional task + this.replaceTaskWithConditionalTask(currTask, cndTsk, physicalContext); + return cndTsk; + } + + private void replaceTaskWithConditionalTask( + Task currTask, ConditionalTask cndTsk, + PhysicalContext physicalContext) { + // add this task into task tree + // set all parent tasks + List> parentTasks = currTask.getParentTasks(); + currTask.setParentTasks(null); + if (parentTasks != null) { + for (Task tsk : parentTasks) { + // make new generated task depends on all the parent tasks of current task. + tsk.addDependentTask(cndTsk); + // remove the current task from its original parent task's dependent task + tsk.removeDependentTask(currTask); + } + } else { + // remove from current root task and add conditional task to root tasks + physicalContext.removeFromRootTask(currTask); + physicalContext.addToRootTask(cndTsk); + } + // set all child tasks + List> oldChildTasks = currTask.getChildTasks(); + if (oldChildTasks != null) { + for (Task tsk : cndTsk.getListTasks()) { + if (tsk.equals(currTask)) { + continue; + } + for (Task oldChild : oldChildTasks) { + tsk.addDependentTask(oldChild); + } + } + } + } + + @Override + public Object dispatch(Node nd, Stack stack, Object... 
nodeOutputs) + throws SemanticException { + if (nodeOutputs == null || nodeOutputs.length == 0) { + throw new SemanticException("No Dispatch Context"); + } + + TaskGraphWalkerContext walkerCtx = (TaskGraphWalkerContext) nodeOutputs[0]; + + Task currTask = (Task) nd; + // not map reduce task or not conditional task, just skip + if (currTask.isMapRedTask()) { + if (currTask instanceof ConditionalTask) { + // get the list of task + List> taskList = ((ConditionalTask) currTask).getListTasks(); + for (Task tsk : taskList) { + if (tsk.isMapRedTask()) { + ConditionalTask cndTask = this.processCurrentTask((MapRedTask) tsk, + ((ConditionalTask) currTask), physicalContext.getContext()); + walkerCtx.addToDispatchList(cndTask); + } + } + } else { + ConditionalTask cndTask = this.processCurrentTask((MapRedTask) currTask, null, + physicalContext.getContext()); + walkerCtx.addToDispatchList(cndTask); + } + } + return null; + } + + public abstract ConditionalTask processCurrentTask(MapRedTask currTask, + ConditionalTask conditionalTask, Context context) + throws SemanticException; + + public abstract MapJoinOperator getMapJoinOperator(MapRedTask task, + MapredWork qork, QBJoinTree joinTree, int mapJoinPos) throws SemanticException; +} Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/CommonJoinResolver.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/CommonJoinResolver.java (revision 1431968) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/CommonJoinResolver.java (working copy) @@ -17,41 +17,12 @@ */ package org.apache.hadoop.hive.ql.optimizer.physical; -import java.io.ByteArrayInputStream; -import java.io.InputStream; -import java.io.Serializable; import java.util.ArrayList; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Stack; -import org.apache.hadoop.fs.ContentSummary; -import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.hadoop.hive.ql.Context; -import org.apache.hadoop.hive.ql.exec.ConditionalTask; -import org.apache.hadoop.hive.ql.exec.JoinOperator; -import org.apache.hadoop.hive.ql.exec.MapRedTask; -import org.apache.hadoop.hive.ql.exec.Operator; -import org.apache.hadoop.hive.ql.exec.Task; -import org.apache.hadoop.hive.ql.exec.TaskFactory; -import org.apache.hadoop.hive.ql.exec.Utilities; import org.apache.hadoop.hive.ql.lib.Dispatcher; import org.apache.hadoop.hive.ql.lib.Node; import org.apache.hadoop.hive.ql.lib.TaskGraphWalker; -import org.apache.hadoop.hive.ql.lib.TaskGraphWalker.TaskGraphWalkerContext; -import org.apache.hadoop.hive.ql.optimizer.MapJoinProcessor; -import org.apache.hadoop.hive.ql.parse.ParseContext; -import org.apache.hadoop.hive.ql.parse.QBJoinTree; import org.apache.hadoop.hive.ql.parse.SemanticException; -import org.apache.hadoop.hive.ql.plan.ConditionalResolverCommonJoin; -import - org.apache.hadoop.hive.ql.plan.ConditionalResolverCommonJoin.ConditionalResolverCommonJoinCtx; -import org.apache.hadoop.hive.ql.plan.ConditionalWork; -import org.apache.hadoop.hive.ql.plan.JoinDesc; -import org.apache.hadoop.hive.ql.plan.MapredWork; -import org.apache.hadoop.hive.ql.plan.OperatorDesc; public class CommonJoinResolver implements PhysicalPlanResolver { @@ -70,239 +41,4 @@ ogw.startWalking(topNodes, null); return pctx; } - - - /** - * Iterator each tasks. If this task has a local work,create a new task for this local work, named - * MapredLocalTask. 
then make this new generated task depends on current task's parent task, and - * make current task depends on this new generated task - */ - class CommonJoinTaskDispatcher implements Dispatcher { - - private final PhysicalContext physicalContext; - - public CommonJoinTaskDispatcher(PhysicalContext context) { - super(); - physicalContext = context; - } - - private ConditionalTask processCurrentTask(MapRedTask currTask, - ConditionalTask conditionalTask, Context context) - throws SemanticException { - - // whether it contains common join op; if contains, return this common join op - JoinOperator joinOp = getJoinOp(currTask); - if (joinOp == null) { - return null; - } - currTask.setTaskTag(Task.COMMON_JOIN); - - MapredWork currWork = currTask.getWork(); - // create conditional work list and task list - List listWorks = new ArrayList(); - List> listTasks = new ArrayList>(); - - // create alias to task mapping and alias to input file mapping for resolver - HashMap> aliasToTask = new HashMap>(); - HashMap> pathToAliases = currTask.getWork().getPathToAliases(); - - // get parseCtx for this Join Operator - ParseContext parseCtx = physicalContext.getParseContext(); - QBJoinTree joinTree = parseCtx.getJoinContext().get(joinOp); - - // start to generate multiple map join tasks - JoinDesc joinDesc = joinOp.getConf(); - Byte[] order = joinDesc.getTagOrder(); - int numAliases = order.length; - - long aliasTotalKnownInputSize = 0; - HashMap aliasToSize = new HashMap(); - try { - // go over all the input paths, and calculate a known total size, known - // size for each input alias. - Utilities.getInputSummary(context, currWork, null).getLength(); - - // set alias to size mapping, this can be used to determine if one table - // is choosen as big table, what's the total size of left tables, which - // are going to be small tables. 
- for (Map.Entry> entry : pathToAliases.entrySet()) { - String path = entry.getKey(); - List aliasList = entry.getValue(); - ContentSummary cs = context.getCS(path); - if (cs != null) { - long size = cs.getLength(); - for (String alias : aliasList) { - aliasTotalKnownInputSize += size; - Long es = aliasToSize.get(alias); - if(es == null) { - es = new Long(0); - } - es += size; - aliasToSize.put(alias, es); - } - } - } - - HashSet bigTableCandidates = MapJoinProcessor.getBigTableCandidates(joinDesc.getConds()); - - // no table could be the big table; there is no need to convert - if (bigTableCandidates == null) { - return null; - } - currWork.setOpParseCtxMap(parseCtx.getOpParseCtx()); - currWork.setJoinTree(joinTree); - - String xml = currWork.toXML(); - String bigTableAlias = null; - - long ThresholdOfSmallTblSizeSum = HiveConf.getLongVar(context.getConf(), - HiveConf.ConfVars.HIVESMALLTABLESFILESIZE); - for (int i = 0; i < numAliases; i++) { - // this table cannot be big table - if (!bigTableCandidates.contains(i)) { - continue; - } - - // create map join task and set big table as i - // deep copy a new mapred work from xml - InputStream in = new ByteArrayInputStream(xml.getBytes("UTF-8")); - MapredWork newWork = Utilities.deserializeMapRedWork(in, physicalContext.getConf()); - // create a mapred task for this work - MapRedTask newTask = (MapRedTask) TaskFactory.get(newWork, physicalContext - .getParseContext().getConf()); - JoinOperator newJoinOp = getJoinOp(newTask); - - // optimize this newWork and assume big table position is i - bigTableAlias = MapJoinProcessor.genMapJoinOpAndLocalWork(newWork, newJoinOp, i); - - Long aliasKnownSize = aliasToSize.get(bigTableAlias); - if (aliasKnownSize != null && aliasKnownSize.longValue() > 0) { - long smallTblTotalKnownSize = aliasTotalKnownInputSize - - aliasKnownSize.longValue(); - if(smallTblTotalKnownSize > ThresholdOfSmallTblSizeSum) { - //this table is not good to be a big table. 
- continue; - } - } - - // add into conditional task - listWorks.add(newWork); - listTasks.add(newTask); - newTask.setTaskTag(Task.CONVERTED_MAPJOIN); - - //set up backup task - newTask.setBackupTask(currTask); - newTask.setBackupChildrenTasks(currTask.getChildTasks()); - - // put the mapping alias to task - aliasToTask.put(bigTableAlias, newTask); - } - } catch (Exception e) { - e.printStackTrace(); - throw new SemanticException("Generate Map Join Task Error: " + e.getMessage()); - } - - // insert current common join task to conditional task - listWorks.add(currTask.getWork()); - listTasks.add(currTask); - // clear JoinTree and OP Parse Context - currWork.setOpParseCtxMap(null); - currWork.setJoinTree(null); - - // create conditional task and insert conditional task into task tree - ConditionalWork cndWork = new ConditionalWork(listWorks); - ConditionalTask cndTsk = (ConditionalTask) TaskFactory.get(cndWork, parseCtx.getConf()); - cndTsk.setListTasks(listTasks); - - // set resolver and resolver context - cndTsk.setResolver(new ConditionalResolverCommonJoin()); - ConditionalResolverCommonJoinCtx resolverCtx = new ConditionalResolverCommonJoinCtx(); - resolverCtx.setPathToAliases(pathToAliases); - resolverCtx.setAliasToKnownSize(aliasToSize); - resolverCtx.setAliasToTask(aliasToTask); - resolverCtx.setCommonJoinTask(currTask); - resolverCtx.setLocalTmpDir(context.getLocalScratchDir(false)); - resolverCtx.setHdfsTmpDir(context.getMRScratchDir()); - cndTsk.setResolverCtx(resolverCtx); - - //replace the current task with the new generated conditional task - this.replaceTaskWithConditionalTask(currTask, cndTsk, physicalContext); - return cndTsk; - } - - private void replaceTaskWithConditionalTask( - Task currTask, ConditionalTask cndTsk, - PhysicalContext physicalContext) { - // add this task into task tree - // set all parent tasks - List> parentTasks = currTask.getParentTasks(); - currTask.setParentTasks(null); - if (parentTasks != null) { - for (Task tsk : parentTasks) { - // make new generated task depends on all the parent tasks of current task. - tsk.addDependentTask(cndTsk); - // remove the current task from its original parent task's dependent task - tsk.removeDependentTask(currTask); - } - } else { - // remove from current root task and add conditional task to root tasks - physicalContext.removeFromRootTask(currTask); - physicalContext.addToRootTask(cndTsk); - } - // set all child tasks - List> oldChildTasks = currTask.getChildTasks(); - if (oldChildTasks != null) { - for (Task tsk : cndTsk.getListTasks()) { - if (tsk.equals(currTask)) { - continue; - } - for (Task oldChild : oldChildTasks) { - tsk.addDependentTask(oldChild); - } - } - } - } - - @Override - public Object dispatch(Node nd, Stack stack, Object... 
nodeOutputs) - throws SemanticException { - if (nodeOutputs == null || nodeOutputs.length == 0) { - throw new SemanticException("No Dispatch Context"); - } - - TaskGraphWalkerContext walkerCtx = (TaskGraphWalkerContext) nodeOutputs[0]; - - Task currTask = (Task) nd; - // not map reduce task or not conditional task, just skip - if (currTask.isMapRedTask()) { - if (currTask instanceof ConditionalTask) { - // get the list of task - List> taskList = ((ConditionalTask) currTask).getListTasks(); - for (Task tsk : taskList) { - if (tsk.isMapRedTask()) { - ConditionalTask cndTask = this.processCurrentTask((MapRedTask) tsk, - ((ConditionalTask) currTask), physicalContext.getContext()); - walkerCtx.addToDispatchList(cndTask); - } - } - } else { - ConditionalTask cndTask = this.processCurrentTask((MapRedTask) currTask, null, physicalContext.getContext()); - walkerCtx.addToDispatchList(cndTask); - } - } - return null; - } - - private JoinOperator getJoinOp(MapRedTask task) throws SemanticException { - if (task.getWork() == null) { - return null; - } - Operator reducerOp = task.getWork().getReducer(); - if (reducerOp instanceof JoinOperator) { - return (JoinOperator) reducerOp; - } else { - return null; - } - } - } } Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortedMergeBucketMapJoinOptimizer.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortedMergeBucketMapJoinOptimizer.java (revision 1431968) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortedMergeBucketMapJoinOptimizer.java (working copy) @@ -28,13 +28,13 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.hadoop.hive.metastore.api.Order; import org.apache.hadoop.hive.ql.ErrorMsg; import org.apache.hadoop.hive.ql.exec.DummyStoreOperator; +import org.apache.hadoop.hive.ql.exec.JoinOperator; import org.apache.hadoop.hive.ql.exec.MapJoinOperator; import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator; import org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator; -import org.apache.hadoop.hive.ql.exec.TableScanOperator; import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker; import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher; import org.apache.hadoop.hive.ql.lib.Dispatcher; @@ -44,16 +44,8 @@ import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx; import org.apache.hadoop.hive.ql.lib.Rule; import org.apache.hadoop.hive.ql.lib.RuleRegExp; -import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.hive.ql.metadata.Partition; -import org.apache.hadoop.hive.ql.metadata.Table; -import org.apache.hadoop.hive.ql.optimizer.ppr.PartitionPruner; import org.apache.hadoop.hive.ql.parse.ParseContext; -import org.apache.hadoop.hive.ql.parse.PrunedPartitionList; -import org.apache.hadoop.hive.ql.parse.QB; -import org.apache.hadoop.hive.ql.parse.QBJoinTree; import org.apache.hadoop.hive.ql.parse.SemanticException; -import org.apache.hadoop.hive.ql.parse.TableAccessAnalyzer; import org.apache.hadoop.hive.ql.plan.OperatorDesc; import org.apache.hadoop.hive.ql.plan.SMBJoinDesc; @@ -66,9 +58,38 @@ public SortedMergeBucketMapJoinOptimizer() { } + private void getListOfRejectedJoins( + ParseContext pctx, SortBucketJoinOptProcCtx smbJoinContext) + throws SemanticException { + + // go through all joins - it should only contain selects and filters + Map opRules = new LinkedHashMap(); + opRules.put(new 
RuleRegExp("R1", JoinOperator.getOperatorName() + "%"), + getCheckCandidateJoin()); + + // The dispatcher fires the processor corresponding to the closest matching + // rule and passes the context along + Dispatcher disp = new DefaultRuleDispatcher(getDefaultProc(), opRules, smbJoinContext); + GraphWalker ogw = new DefaultGraphWalker(disp); + + // Create a list of topop nodes + ArrayList topNodes = new ArrayList(); + topNodes.addAll(pctx.getTopOps().values()); + ogw.startWalking(topNodes, null); + } + @Override public ParseContext transform(ParseContext pctx) throws SemanticException { + HiveConf conf = pctx.getConf(); + SortBucketJoinOptProcCtx smbJoinContext = + new SortBucketJoinOptProcCtx(conf); + // Get a list of joins which cannot be converted to a sort merge join + // Only selects and filters operators are allowed between the table scan and + // join currently. More operators can be added - the method supportAutomaticSortMergeJoin + // dictates which operator is allowed + getListOfRejectedJoins(pctx, smbJoinContext); + Map opRules = new LinkedHashMap(); // go through all map joins and find out all which have enabled bucket map // join. @@ -76,7 +97,15 @@ getSortedMergeBucketMapjoinProc(pctx)); // The dispatcher fires the processor corresponding to the closest matching // rule and passes the context along - Dispatcher disp = new DefaultRuleDispatcher(getDefaultProc(), opRules, null); + + // There is no need for the user to specify mapjoin for it to be + // converted to sort-merge join + if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_AUTO_SORT_MERGE_JOIN)) { + opRules.put(new RuleRegExp("R2", "JOIN%"), + getSortedMergeJoinProc(pctx)); + } + + Dispatcher disp = new DefaultRuleDispatcher(getDefaultProc(), opRules, smbJoinContext); GraphWalker ogw = new DefaultGraphWalker(disp); // Create a list of topop nodes @@ -87,10 +116,41 @@ return pctx; } + class SortedMergeJoinProc extends AbstractSMBJoinProc implements NodeProcessor { + + public SortedMergeJoinProc(ParseContext pctx) { + super(pctx); + } + + public SortedMergeJoinProc() { + } + + @Override + public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, + Object... nodeOutputs) throws SemanticException { + + JoinOperator joinOp = (JoinOperator) nd; + SortBucketJoinOptProcCtx smbJoinContext = (SortBucketJoinOptProcCtx)procCtx; + + boolean convert = + canConvertJoinToSMBJoin( + joinOp, smbJoinContext, pGraphContext); + + if (convert) { + convertJoinToSMBJoin(joinOp, smbJoinContext, pGraphContext); + } + return null; + } + } + private NodeProcessor getSortedMergeBucketMapjoinProc(ParseContext pctx) { return new SortedMergeBucketMapjoinProc(pctx); } + private NodeProcessor getSortedMergeJoinProc(ParseContext pctx) { + return new SortedMergeJoinProc(pctx); + } + private NodeProcessor getDefaultProc() { return new NodeProcessor() { @Override @@ -102,71 +162,58 @@ }; } - class SortedMergeBucketMapjoinProc extends AbstractBucketJoinProc implements NodeProcessor { - private ParseContext pGraphContext; + // check if the join operator encountered is a candidate for being converted + // to a sort-merge join + private NodeProcessor getCheckCandidateJoin() { + return new NodeProcessor() { + @Override + public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, + Object... 
nodeOutputs) throws SemanticException { + SortBucketJoinOptProcCtx smbJoinContext = (SortBucketJoinOptProcCtx)procCtx; + JoinOperator joinOperator = (JoinOperator)nd; + int size = stack.size(); + if (!(stack.get(size-1) instanceof JoinOperator) || + !(stack.get(size-2) instanceof ReduceSinkOperator)) { + smbJoinContext.getSetOfRejectedJoins().add(joinOperator); + return null; + } + // If any operator in the stack does not support a auto-conversion, this join should + // not be converted. + for (int pos = size -3; pos >= 0; pos--) { + Operator op = (Operator)stack.get(pos); + if (!op.supportAutomaticSortMergeJoin()) { + smbJoinContext.getSetOfRejectedJoins().add(joinOperator); + return null; + } + } + + return null; + } + }; + } + + class SortedMergeBucketMapjoinProc extends AbstractSMBJoinProc implements NodeProcessor { public SortedMergeBucketMapjoinProc(ParseContext pctx) { - this.pGraphContext = pctx; + super(pctx); } public SortedMergeBucketMapjoinProc() { } - // Return true or false based on whether the mapjoin was converted successfully to - // a sort-merge map join operator. - private boolean convertSMBJoin(Node nd, Stack stack, NodeProcessorCtx procCtx, - Object... nodeOutputs) throws SemanticException { + @Override + public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, + Object... nodeOutputs) throws SemanticException { if (nd instanceof SMBMapJoinOperator) { - return false; + return null; } + MapJoinOperator mapJoinOp = (MapJoinOperator) nd; - if (mapJoinOp.getConf().getAliasBucketFileNameMapping() == null - || mapJoinOp.getConf().getAliasBucketFileNameMapping().size() == 0) { - return false; - } + SortBucketJoinOptProcCtx smbJoinContext = (SortBucketJoinOptProcCtx)procCtx; - boolean tableSorted = true; - QBJoinTree joinCxt = this.pGraphContext.getMapJoinContext() - .get(mapJoinOp); - if (joinCxt == null) { - return false; - } - String[] srcs = joinCxt.getBaseSrc(); - for (int srcPos = 0; srcPos < srcs.length; srcPos++) { - srcs[srcPos] = QB.getAppendedAliasFromId(joinCxt.getId(), srcs[srcPos]); - } + boolean convert = + canConvertBucketMapJoinToSMBJoin(mapJoinOp, stack, smbJoinContext, nodeOutputs); - // All the tables/partitions columns should be sorted in the same order - // For example, if tables A and B are being joined on columns c1, c2 and c3 - // which are the sorted and bucketed columns. The join would work, as long - // c1, c2 and c3 are sorted in the same order. - List sortColumnsFirstTable = new ArrayList(); - - for (int pos = 0; pos < srcs.length; pos++) { - tableSorted = tableSorted - && isTableSorted(this.pGraphContext, - mapJoinOp, - joinCxt, - pos, - sortColumnsFirstTable, - srcs); - } - if (!tableSorted) { - //this is a mapjoin but not suit for a sort merge bucket map join. check outer joins - MapJoinProcessor.checkMapJoin(((MapJoinOperator) nd).getConf().getPosBigTable(), - ((MapJoinOperator) nd).getConf().getConds()); - return false; - } - // convert a bucket map join operator to a sorted merge bucket map join - // operator - convertToSMBJoin(mapJoinOp, srcs); - return true; - } - - @Override - public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, - Object... 
nodeOutputs) throws SemanticException { - boolean convert = convertSMBJoin(nd, stack, procCtx, nodeOutputs); // Throw an error if the user asked for sort merge bucketed mapjoin to be enforced // and sort merge bucketed mapjoin cannot be performed if (!convert && @@ -175,6 +222,9 @@ throw new SemanticException(ErrorMsg.SORTMERGE_MAPJOIN_FAILED.getMsg()); } + if (convert) { + convertBucketMapJoinToSMBJoin(mapJoinOp, smbJoinContext, pGraphContext); + } return null; } @@ -253,167 +303,5 @@ } return smbJop; } - - /** - * Whether this table is eligible for a sort-merge join. - * - * @param pctx parse context - * @param op map join operator being considered - * @param joinTree join tree being considered - * @param alias table alias in the join tree being checked - * @param pos position of the table - * @param sortColumnsFirstTable The names and order of the sorted columns for the first table. - * It is not initialized when pos = 0. - * @return - * @throws SemanticException - */ - private boolean isTableSorted(ParseContext pctx, - MapJoinOperator op, - QBJoinTree joinTree, - int pos, - List sortColumnsFirstTable, - String[] aliases) - throws SemanticException { - String alias = aliases[pos]; - Map topToTable = this.pGraphContext - .getTopToTable(); - - /* - * Consider a query like: - * - * select -- mapjoin(subq1) -- * from - * (select a.key, a.value from tbl1 a) subq1 - * join - * (select a.key, a.value from tbl2 a) subq2 - * on subq1.key = subq2.key; - * - * aliasToOpInfo contains the SelectOperator for subq1 and subq2. - * We need to traverse the tree (using TableAccessAnalyzer) to get to the base - * table. If the object being map-joined is a base table, then aliasToOpInfo - * contains the TableScanOperator, and TableAccessAnalyzer is a no-op. - */ - Operator topOp = joinTree.getAliasToOpInfo().get(alias); - if (topOp == null) { - return false; - } - List joinCols = toColumns(op.getConf().getKeys().get((byte) pos)); - if (joinCols == null || joinCols.isEmpty()) { - return false; - } - TableScanOperator tso = TableAccessAnalyzer.genRootTableScan(topOp, joinCols); - if (tso == null) { - return false; - } - - // For nested sub-queries, the alias mapping is not maintained in QB currently. - /* - * Consider a query like: - * - * select count(*) from - * ( - * select key, count(*) from - * ( - * select --mapjoin(a)-- a.key as key, a.value as val1, b.value as val2 - * from tbl1 a join tbl2 b on a.key = b.key - * ) subq1 - * group by key - * ) subq2; - * - * The table alias should be subq2:subq1:a which needs to be fetched from topOps. - */ - if (pGraphContext.getTopOps().containsValue(tso)) { - for (Map.Entry> topOpEntry : - this.pGraphContext.getTopOps().entrySet()) { - if (topOpEntry.getValue() == tso) { - alias = topOpEntry.getKey(); - aliases[pos] = alias; - break; - } - } - } - else { - // Ideally, this should never happen, and this should be an assert. 
- return false; - } - - Table tbl = topToTable.get(tso); - - if (tbl.isPartitioned()) { - PrunedPartitionList prunedParts = null; - try { - prunedParts = pGraphContext.getOpToPartList().get(tso); - if (prunedParts == null) { - prunedParts = PartitionPruner.prune(tbl, pGraphContext - .getOpToPartPruner().get(tso), pGraphContext.getConf(), alias, - pGraphContext.getPrunedPartitions()); - pGraphContext.getOpToPartList().put(tso, prunedParts); - } - } catch (HiveException e) { - LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e)); - throw new SemanticException(e.getMessage(), e); - } - List partitions = prunedParts.getNotDeniedPartns(); - // Populate the names and order of columns for the first partition of the - // first table - if ((pos == 0) && (partitions != null) && (!partitions.isEmpty())) { - Partition firstPartition = partitions.get(0); - sortColumnsFirstTable.addAll(firstPartition.getSortCols()); - } - - for (Partition partition : prunedParts.getNotDeniedPartns()) { - if (!checkSortColsAndJoinCols(partition.getSortCols(), - joinCols, - sortColumnsFirstTable)) { - return false; - } - } - return true; - } - - // Populate the names and order of columns for the first table - if (pos == 0) { - sortColumnsFirstTable.addAll(tbl.getSortCols()); - } - - return checkSortColsAndJoinCols(tbl.getSortCols(), - joinCols, - sortColumnsFirstTable); - } - - private boolean checkSortColsAndJoinCols(List sortCols, - List joinCols, - List sortColumnsFirstPartition) { - - if (sortCols == null || sortCols.size() < joinCols.size()) { - return false; - } - - // A join is eligible for a sort-merge join, only if it is eligible for - // a bucketized map join. So, we dont need to check for bucketized map - // join here. We are guaranteed that the join keys contain all the - // bucketized keys (note that the order need not be the same). - List sortColNames = new ArrayList(); - - // The join columns should contain all the sort columns - // The sort columns of all the tables should be in the same order - // compare the column names and the order with the first table/partition. - for (int pos = 0; pos < sortCols.size(); pos++) { - Order o = sortCols.get(pos); - if (o.getOrder() != sortColumnsFirstPartition.get(pos).getOrder()) { - return false; - } - sortColNames.add(o.getCol()); - } - - // The column names and order (ascending/descending) matched - // The first 'n' sorted columns should be the same as the joinCols, where - // 'n' is the size of join columns. - // For eg: if the table is sorted by (a,b,c), it is OK to convert if the join is - // on (a), (a,b), or any combination of (a,b,c): - // (a,b,c), (a,c,b), (c,a,b), (c,b,a), (b,c,a), (b,a,c) - // but it is not OK to convert if the join is on (a,c) - return sortColNames.subList(0, joinCols.size()).containsAll(joinCols); - } } - } Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/LeftSortMergeJoinBigTableMatcher.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/LeftSortMergeJoinBigTableMatcher.java (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/LeftSortMergeJoinBigTableMatcher.java (working copy) @@ -0,0 +1,32 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.optimizer; + +import org.apache.hadoop.hive.ql.exec.JoinOperator; +import org.apache.hadoop.hive.ql.parse.ParseContext; + +/* + * This is a pluggable policy to choose the candidate map-join table for converting a join to a + * sort merge join. The leftmost table is chosen as the big table. + */ +public class LeftSortMergeJoinBigTableMatcher implements SortMergeJoinBigTableMatcher { + public int getBigTablePosition(ParseContext parseContext, JoinOperator joinOp) { + return 0; + } +} Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java (revision 1431968) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java (working copy) @@ -74,12 +74,19 @@ } transformations.add(new SamplePruner()); transformations.add(new MapJoinProcessor()); + boolean bucketMapJoinOptimizer = false; if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTBUCKETMAPJOIN)) { transformations.add(new BucketMapJoinOptimizer()); + bucketMapJoinOptimizer = true; - if(HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTSORTMERGEBUCKETMAPJOIN)) { - transformations.add(new SortedMergeBucketMapJoinOptimizer()); + } + + if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTSORTMERGEBUCKETMAPJOIN)) { + if (!bucketMapJoinOptimizer) { + transformations.add(new BucketMapJoinOptimizer()); } + transformations.add(new SortedMergeBucketMapJoinOptimizer()); } + transformations.add(new UnionProcessor()); transformations.add(new JoinReorder()); if(HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTREDUCEDEDUPLICATION)) { Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/BucketJoinOptProcCtx.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/BucketJoinOptProcCtx.java (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/BucketJoinOptProcCtx.java (working copy) @@ -0,0 +1,146 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +package org.apache.hadoop.hive.ql.optimizer; + +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.exec.JoinOperator; +import org.apache.hadoop.hive.ql.exec.MapJoinOperator; +import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx; +import org.apache.hadoop.hive.ql.metadata.Partition; + +public class BucketJoinOptProcCtx implements NodeProcessorCtx { + private static final Log LOG = + LogFactory.getLog(BucketJoinOptProcCtx.class.getName()); + + private final HiveConf conf; + + // we only convert map joins that follows a root table scan in the same + // mapper. That means there is no reducer between the root table scan and + // mapjoin. + private Set setOfRejectedMapjoins = new HashSet(); + private Set setOfRejectedJoins = new HashSet(); + + // The list of join operators which can be converted to a bucketed map join + private Set setOfConvertedJoins = new HashSet(); + + private Map> aliasToPartitionBucketNumberMapping; + private Map>> aliasToPartitionBucketFileNamesMapping; + private Map> bigTblPartsToBucketFileNames; + private Map bigTblPartsToBucketNumber; + private List joinAliases; + private String baseBigAlias; + private boolean bigTablePartitioned; + + public BucketJoinOptProcCtx(HiveConf conf) { + this.conf = conf; + } + + public HiveConf getConf() { + return conf; + } + + public Set getSetOfRejectedMapjoins() { + return setOfRejectedMapjoins; + } + + public void setSetOfRejectedMapjoins(Set listOfRejectedMapjoins) { + this.setOfRejectedMapjoins = listOfRejectedMapjoins; + } + + public Set getSetOfRejectedJoins() { + return setOfRejectedJoins; + } + + public Set getSetOfConvertedJoins() { + return setOfConvertedJoins; + } + + public void setSetOfRejectedJoins(Set setOfRejectedJoins) { + this.setOfRejectedJoins = setOfRejectedJoins; + } + + public void setSetOfConvertedJoins(Set setOfConvertedJoins) { + this.setOfConvertedJoins = setOfConvertedJoins; + } + + public Map> getAliasToPartitionBucketNumberMapping() { + return aliasToPartitionBucketNumberMapping; + } + + public Map>> getAliasToPartitionBucketFileNamesMapping() { + return aliasToPartitionBucketFileNamesMapping; + } + + public Map> getBigTblPartsToBucketFileNames() { + return bigTblPartsToBucketFileNames; + } + + public Map getBigTblPartsToBucketNumber() { + return bigTblPartsToBucketNumber; + } + + public void setAliasToPartitionBucketNumberMapping( + Map> aliasToPartitionBucketNumberMapping) { + this.aliasToPartitionBucketNumberMapping = aliasToPartitionBucketNumberMapping; + } + + public void setAliasToPartitionBucketFileNamesMapping( + Map>> aliasToPartitionBucketFileNamesMapping) { + this.aliasToPartitionBucketFileNamesMapping = aliasToPartitionBucketFileNamesMapping; + } + + public void setBigTblPartsToBucketFileNames( + Map> bigTblPartsToBucketFileNames) { + this.bigTblPartsToBucketFileNames = bigTblPartsToBucketFileNames; + } + + public void setBigTblPartsToBucketNumber(Map bigTblPartsToBucketNumber) { + this.bigTblPartsToBucketNumber = bigTblPartsToBucketNumber; + } + + public void setJoinAliases(List joinAliases) { + this.joinAliases = joinAliases; + } + + public void setBaseBigAlias(String baseBigAlias) { + this.baseBigAlias = baseBigAlias; + } + + public List getJoinAliases() { + return joinAliases; + } + + public String getBaseBigAlias() { + return baseBigAlias; + } + + public boolean 
isBigTablePartitioned() { + return bigTablePartitioned; + } + + public void setBigTablePartitioned(boolean bigTablePartitioned) { + this.bigTablePartitioned = bigTablePartitioned; + } +} Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortMergeJoinBigTableMatcher.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortMergeJoinBigTableMatcher.java (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortMergeJoinBigTableMatcher.java (working copy) @@ -0,0 +1,28 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.optimizer; + +import org.apache.hadoop.hive.ql.exec.JoinOperator; +import org.apache.hadoop.hive.ql.parse.ParseContext; +import org.apache.hadoop.hive.ql.parse.SemanticException; + +public interface SortMergeJoinBigTableMatcher { + public int getBigTablePosition(ParseContext parseContext, JoinOperator joinOp) + throws SemanticException; +} Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/CommonSizeSortMergeJoinBigTableMatcher.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/CommonSizeSortMergeJoinBigTableMatcher.java (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/CommonSizeSortMergeJoinBigTableMatcher.java (working copy) @@ -0,0 +1,84 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.ql.optimizer; + +import java.util.List; + +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.exec.TableScanOperator; +import org.apache.hadoop.hive.ql.metadata.Partition; +import org.apache.hadoop.hive.ql.metadata.Table; +import org.apache.hadoop.hive.ql.plan.OperatorDesc; + +/* + * This is a pluggable policy to choose the candidate map-join table for converting a join to a + * sort merge join. The largest table is chosen based on the size of the tables. + */ +public class CommonSizeSortMergeJoinBigTableMatcher { + protected void getListTopOps( + Operator op, List topOps) { + if ((op.getParentOperators() == null) || + (op.getParentOperators().isEmpty())) { + return; + } + + for (Operator parentOp : op.getParentOperators()) { + if (parentOp instanceof TableScanOperator) { + topOps.add((TableScanOperator)parentOp); + } + else { + getListTopOps(parentOp, topOps); + } + } + } + + private long getSize(HiveConf conf, String size, Path path) { + // If the size is present in the metastore, use it + if (size != null) { + try { + return Long.valueOf(size); + } catch (NumberFormatException e) { + return 0; + } + } + + try { + FileSystem fs = path.getFileSystem(conf); + return fs.getContentSummary(path).getLength(); + } catch (Exception e) { + return 0; + } + } + + protected long getSize(HiveConf conf, Table table) { + Path path = table.getPath(); + String size = table.getProperty("totalSize"); + return getSize(conf, size, path); + } + + protected long getSize(HiveConf conf, Partition partition) { + Path path = partition.getPartitionPath(); + String size = partition.getParameters().get("totalSize"); + + return getSize(conf, size, path); + } +} Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/BucketMapJoinOptimizer.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/BucketMapJoinOptimizer.java (revision 1431968) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/BucketMapJoinOptimizer.java (working copy) @@ -17,32 +17,17 @@ */ package org.apache.hadoop.hive.ql.optimizer; -import java.io.IOException; -import java.net.URI; import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Iterator; import java.util.LinkedHashMap; -import java.util.List; import java.util.Map; -import java.util.Map.Entry; -import java.util.Set; import java.util.Stack; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.ql.ErrorMsg; import org.apache.hadoop.hive.ql.exec.MapJoinOperator; -import org.apache.hadoop.hive.ql.exec.Operator; import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator; -import org.apache.hadoop.hive.ql.exec.TableScanOperator; import org.apache.hadoop.hive.ql.exec.UnionOperator; import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker; import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher; @@ -53,18 +38,8 @@ import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx; import org.apache.hadoop.hive.ql.lib.Rule; import org.apache.hadoop.hive.ql.lib.RuleRegExp; -import org.apache.hadoop.hive.ql.metadata.HiveException; -import
org.apache.hadoop.hive.ql.metadata.Partition; -import org.apache.hadoop.hive.ql.metadata.Table; -import org.apache.hadoop.hive.ql.optimizer.ppr.PartitionPruner; import org.apache.hadoop.hive.ql.parse.ParseContext; -import org.apache.hadoop.hive.ql.parse.PrunedPartitionList; -import org.apache.hadoop.hive.ql.parse.QB; -import org.apache.hadoop.hive.ql.parse.QBJoinTree; import org.apache.hadoop.hive.ql.parse.SemanticException; -import org.apache.hadoop.hive.ql.parse.TableAccessAnalyzer; -import org.apache.hadoop.hive.ql.plan.MapJoinDesc; -import org.apache.hadoop.hive.ql.plan.OperatorDesc; /** * this transformation does bucket map join optimization. @@ -81,8 +56,8 @@ public ParseContext transform(ParseContext pctx) throws SemanticException { Map opRules = new LinkedHashMap(); - BucketMapjoinOptProcCtx bucketMapJoinOptimizeCtx = - new BucketMapjoinOptProcCtx(pctx.getConf()); + BucketJoinOptProcCtx bucketMapJoinOptimizeCtx = + new BucketJoinOptProcCtx(pctx.getConf()); // process map joins with no reducers pattern opRules.put(new RuleRegExp("R1", @@ -116,11 +91,11 @@ return new NodeProcessor() { @Override public Object process(Node nd, Stack stack, - NodeProcessorCtx procCtx, Object... nodeOutputs) + NodeProcessorCtx procCtx, Object... nodeOutputs) throws SemanticException { MapJoinOperator mapJoinOp = (MapJoinOperator) nd; - BucketMapjoinOptProcCtx context = (BucketMapjoinOptProcCtx) procCtx; - context.listOfRejectedMapjoins.add(mapJoinOp); + BucketJoinOptProcCtx context = (BucketJoinOptProcCtx) procCtx; + context.getSetOfRejectedMapjoins().add(mapJoinOp); return null; } }; @@ -130,6 +105,10 @@ return new BucketMapjoinOptProc(pctx); } + private NodeProcessor getBucketjoinProc(ParseContext pctx) { + return new BucketMapjoinOptProc(pctx); + } + private NodeProcessor getDefaultProc() { return new NodeProcessor() { @Override @@ -143,270 +122,20 @@ class BucketMapjoinOptProc extends AbstractBucketJoinProc implements NodeProcessor { - protected ParseContext pGraphContext; - public BucketMapjoinOptProc(ParseContext pGraphContext) { - super(); - this.pGraphContext = pGraphContext; + super(pGraphContext); } - private boolean convertBucketMapJoin(Node nd, Stack stack, NodeProcessorCtx procCtx, - Object... 
nodeOutputs) throws SemanticException { - MapJoinOperator mapJoinOp = (MapJoinOperator) nd; - BucketMapjoinOptProcCtx context = (BucketMapjoinOptProcCtx) procCtx; - HiveConf conf = context.getConf(); - - if (context.getListOfRejectedMapjoins().contains(mapJoinOp)) { - return false; - } - - QBJoinTree joinCxt = this.pGraphContext.getMapJoinContext().get(mapJoinOp); - if (joinCxt == null) { - return false; - } - - List joinAliases = new ArrayList(); - String[] srcs = joinCxt.getBaseSrc(); - String[] left = joinCxt.getLeftAliases(); - List mapAlias = joinCxt.getMapAliases(); - String baseBigAlias = null; - - for (String s : left) { - if (s != null) { - String subQueryAlias = QB.getAppendedAliasFromId(joinCxt.getId(), s); - if (!joinAliases.contains(subQueryAlias)) { - joinAliases.add(subQueryAlias); - if(!mapAlias.contains(s)) { - baseBigAlias = subQueryAlias; - } - } - } - } - - for (String s : srcs) { - if (s != null) { - String subQueryAlias = QB.getAppendedAliasFromId(joinCxt.getId(), s); - if (!joinAliases.contains(subQueryAlias)) { - joinAliases.add(subQueryAlias); - if(!mapAlias.contains(s)) { - baseBigAlias = subQueryAlias; - } - } - } - } - - MapJoinDesc mjDesc = mapJoinOp.getConf(); - LinkedHashMap> aliasToPartitionBucketNumberMapping = - new LinkedHashMap>(); - LinkedHashMap>> aliasToPartitionBucketFileNamesMapping = - new LinkedHashMap>>(); - - Map> topOps = - this.pGraphContext.getTopOps(); - Map topToTable = this.pGraphContext.getTopToTable(); - - // (partition to bucket file names) and (partition to bucket number) for - // the big table; - LinkedHashMap> bigTblPartsToBucketFileNames = new LinkedHashMap>(); - LinkedHashMap bigTblPartsToBucketNumber = new LinkedHashMap(); - - Integer[] orders = null; // accessing order of join cols to bucket cols, should be same - boolean bigTablePartitioned = true; - for (int index = 0; index < joinAliases.size(); index++) { - String alias = joinAliases.get(index); - Operator topOp = joinCxt.getAliasToOpInfo().get(alias); - if (topOp == null) { - return false; - } - List keys = toColumns(mjDesc.getKeys().get((byte) index)); - if (keys == null || keys.isEmpty()) { - return false; - } - int oldKeySize = keys.size(); - TableScanOperator tso = TableAccessAnalyzer.genRootTableScan(topOp, keys); - if (tso == null) { - return false; - } - - // For nested sub-queries, the alias mapping is not maintained in QB currently. - if (topOps.containsValue(tso)) { - for (Map.Entry> topOpEntry : topOps.entrySet()) { - if (topOpEntry.getValue() == tso) { - String newAlias = topOpEntry.getKey(); - joinAliases.set(index, newAlias); - if (baseBigAlias.equals(alias)) { - baseBigAlias = newAlias; - } - alias = newAlias; - break; - } - } - } - else { - // Ideally, this should never happen, and this should be an assert. - return false; - } - - // The join keys cannot be transformed in the sub-query currently. - // TableAccessAnalyzer.genRootTableScan will only return the base table scan - // if the join keys are constants or a column. Even a simple cast of the join keys - // will result in a null table scan operator. In case of constant join keys, they would - // be removed, and the size before and after the genRootTableScan will be different. 
- if (keys.size() != oldKeySize) { - return false; - } - if (orders == null) { - orders = new Integer[keys.size()]; - } - - Table tbl = topToTable.get(tso); - if (tbl.isPartitioned()) { - PrunedPartitionList prunedParts; - try { - prunedParts = pGraphContext.getOpToPartList().get(tso); - if (prunedParts == null) { - prunedParts = PartitionPruner.prune(tbl, pGraphContext.getOpToPartPruner().get(tso), - pGraphContext.getConf(), alias, - pGraphContext.getPrunedPartitions()); - pGraphContext.getOpToPartList().put(tso, prunedParts); - } - } catch (HiveException e) { - // Has to use full name to make sure it does not conflict with - // org.apache.commons.lang.StringUtils - LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e)); - throw new SemanticException(e.getMessage(), e); - } - List partitions = prunedParts.getNotDeniedPartns(); - // construct a mapping of (Partition->bucket file names) and (Partition -> bucket number) - if (partitions.isEmpty()) { - if (!alias.equals(baseBigAlias)) { - aliasToPartitionBucketNumberMapping.put(alias, Arrays. asList()); - aliasToPartitionBucketFileNamesMapping.put(alias, new ArrayList>()); - } - } else { - List buckets = new ArrayList(); - List> files = new ArrayList>(); - for (Partition p : partitions) { - if (!checkBucketColumns(p.getBucketCols(), keys, orders)) { - return false; - } - List fileNames = getOnePartitionBucketFileNames(p.getDataLocation()); - // The number of files for the table should be same as number of buckets. - int bucketCount = p.getBucketCount(); - if (fileNames.size() != bucketCount) { - String msg = "The number of buckets for table " + - tbl.getTableName() + " partition " + p.getName() + " is " + - p.getBucketCount() + ", whereas the number of files is " + fileNames.size(); - throw new SemanticException( - ErrorMsg.BUCKETED_TABLE_METADATA_INCORRECT.getMsg(msg)); - } - if (alias.equals(baseBigAlias)) { - bigTblPartsToBucketFileNames.put(p, fileNames); - bigTblPartsToBucketNumber.put(p, bucketCount); - } else { - files.add(fileNames); - buckets.add(bucketCount); - } - } - if (!alias.equals(baseBigAlias)) { - aliasToPartitionBucketNumberMapping.put(alias, buckets); - aliasToPartitionBucketFileNamesMapping.put(alias, files); - } - } - } else { - if (!checkBucketColumns(tbl.getBucketCols(), keys, orders)) { - return false; - } - List fileNames = getOnePartitionBucketFileNames(tbl.getDataLocation()); - Integer num = new Integer(tbl.getNumBuckets()); - // The number of files for the table should be same as number of buckets. - if (fileNames.size() != num) { - String msg = "The number of buckets for table " + - tbl.getTableName() + " is " + tbl.getNumBuckets() + - ", whereas the number of files is " + fileNames.size(); - throw new SemanticException( - ErrorMsg.BUCKETED_TABLE_METADATA_INCORRECT.getMsg(msg)); - } - if (alias.equals(baseBigAlias)) { - bigTblPartsToBucketFileNames.put(null, fileNames); - bigTblPartsToBucketNumber.put(null, tbl.getNumBuckets()); - bigTablePartitioned = false; - } else { - aliasToPartitionBucketNumberMapping.put(alias, Arrays.asList(num)); - aliasToPartitionBucketFileNamesMapping.put(alias, Arrays.asList(fileNames)); - } - } - } - - // All tables or partitions are bucketed, and their bucket number is - // stored in 'bucketNumbers', we need to check if the number of buckets in - // the big table can be divided by no of buckets in small tables. 
- for (Integer bucketNumber : bigTblPartsToBucketNumber.values()) { - if (!checkBucketNumberAgainstBigTable(aliasToPartitionBucketNumberMapping, bucketNumber)) { - return false; - } - } - - MapJoinDesc desc = mapJoinOp.getConf(); - - Map>> aliasBucketFileNameMapping = - new LinkedHashMap>>(); - - // sort bucket names for the big table - for (List partBucketNames : bigTblPartsToBucketFileNames.values()) { - Collections.sort(partBucketNames); - } - - // go through all small tables and get the mapping from bucket file name - // in the big table to bucket file names in small tables. - for (int j = 0; j < joinAliases.size(); j++) { - String alias = joinAliases.get(j); - if (alias.equals(baseBigAlias)) { - continue; - } - for (List names : aliasToPartitionBucketFileNamesMapping.get(alias)) { - Collections.sort(names); - } - List smallTblBucketNums = aliasToPartitionBucketNumberMapping.get(alias); - List> smallTblFilesList = aliasToPartitionBucketFileNamesMapping.get(alias); - - Map> mapping = new LinkedHashMap>(); - aliasBucketFileNameMapping.put(alias, mapping); - - // for each bucket file in big table, get the corresponding bucket file - // name in the small table. - // more than 1 partition in the big table, do the mapping for each partition - Iterator>> bigTblPartToBucketNames = - bigTblPartsToBucketFileNames.entrySet().iterator(); - Iterator> bigTblPartToBucketNum = bigTblPartsToBucketNumber - .entrySet().iterator(); - while (bigTblPartToBucketNames.hasNext()) { - assert bigTblPartToBucketNum.hasNext(); - int bigTblBucketNum = bigTblPartToBucketNum.next().getValue(); - List bigTblBucketNameList = bigTblPartToBucketNames.next().getValue(); - fillMapping(smallTblBucketNums, smallTblFilesList, - mapping, bigTblBucketNum, bigTblBucketNameList, desc.getBigTableBucketNumMapping()); - } - } - desc.setAliasBucketFileNameMapping(aliasBucketFileNameMapping); - desc.setBigTableAlias(baseBigAlias); - if (bigTablePartitioned) { - desc.setBigTablePartSpecToFileMapping(convert(bigTblPartsToBucketFileNames)); - } - // successfully convert to bucket map join - desc.setBucketMapJoin(true); - - return true; - } - - @Override @SuppressWarnings("unchecked") public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, Object... 
nodeOutputs) throws SemanticException { + BucketJoinOptProcCtx context = (BucketJoinOptProcCtx) procCtx; + MapJoinOperator mapJoinOperator = (MapJoinOperator) nd; - boolean convert = convertBucketMapJoin(nd, stack, procCtx, nodeOutputs); - BucketMapjoinOptProcCtx context = (BucketMapjoinOptProcCtx) procCtx; + // can the mapjoin present be converted to a bucketed mapjoin + boolean convert = canConvertMapJoinToBucketMapJoin( + mapJoinOperator, pGraphContext, context); HiveConf conf = context.getConf(); // Throw an error if the user asked for bucketed mapjoin to be enforced and @@ -415,119 +144,12 @@ throw new SemanticException(ErrorMsg.BUCKET_MAPJOIN_NOT_POSSIBLE.getMsg()); } - return null; - } - - // convert partition to partition spec string - private Map> convert(Map> mapping) { - Map> converted = new HashMap>(); - for (Map.Entry> entry : mapping.entrySet()) { - converted.put(entry.getKey().getName(), entry.getValue()); + if (convert) { + // convert the mapjoin to a bucketized mapjoin + convertMapJoinToBucketMapJoin(mapJoinOperator, context); } - return converted; - } - // called for each partition of big table and populates mapping for each file in the partition - private void fillMapping( - List smallTblBucketNums, - List> smallTblFilesList, - Map> mapping, - int bigTblBucketNum, List bigTblBucketNameList, - Map bucketFileNameMapping) { - - for (int bindex = 0; bindex < bigTblBucketNameList.size(); bindex++) { - ArrayList resultFileNames = new ArrayList(); - for (int sindex = 0; sindex < smallTblBucketNums.size(); sindex++) { - int smallTblBucketNum = smallTblBucketNums.get(sindex); - List smallTblFileNames = smallTblFilesList.get(sindex); - if (bigTblBucketNum >= smallTblBucketNum) { - // if the big table has more buckets than the current small table, - // use "MOD" to get small table bucket names. For example, if the big - // table has 4 buckets and the small table has 2 buckets, then the - // mapping should be 0->0, 1->1, 2->0, 3->1. - int toAddSmallIndex = bindex % smallTblBucketNum; - resultFileNames.add(smallTblFileNames.get(toAddSmallIndex)); - } else { - int jump = smallTblBucketNum / bigTblBucketNum; - for (int i = bindex; i < smallTblFileNames.size(); i = i + jump) { - resultFileNames.add(smallTblFileNames.get(i)); - } - } - } - String inputBigTBLBucket = bigTblBucketNameList.get(bindex); - mapping.put(inputBigTBLBucket, resultFileNames); - bucketFileNameMapping.put(inputBigTBLBucket, bindex); - } + return null; } - - private boolean checkBucketNumberAgainstBigTable( - Map> aliasToBucketNumber, int bucketNumberInPart) { - for (List bucketNums : aliasToBucketNumber.values()) { - for (int nxt : bucketNums) { - boolean ok = (nxt >= bucketNumberInPart) ? 
nxt % bucketNumberInPart == 0 - : bucketNumberInPart % nxt == 0; - if (!ok) { - return false; - } - } - } - return true; - } - - private List getOnePartitionBucketFileNames(URI location) - throws SemanticException { - List fileNames = new ArrayList(); - try { - FileSystem fs = FileSystem.get(location, this.pGraphContext.getConf()); - FileStatus[] files = fs.listStatus(new Path(location.toString())); - if (files != null) { - for (FileStatus file : files) { - fileNames.add(file.getPath().toString()); - } - } - } catch (IOException e) { - throw new SemanticException(e); - } - return fileNames; - } - - private boolean checkBucketColumns(List bucketColumns, List keys, - Integer[] orders) { - if (keys == null || bucketColumns == null || bucketColumns.isEmpty()) { - return false; - } - for (int i = 0; i < keys.size(); i++) { - int index = bucketColumns.indexOf(keys.get(i)); - if (orders[i] != null && orders[i] != index) { - return false; - } - orders[i] = index; - } - // Check if the join columns contains all bucket columns. - // If a table is bucketized on column B, but the join key is A and B, - // it is easy to see joining on different buckets yield empty results. - return keys.containsAll(bucketColumns); - } } - - class BucketMapjoinOptProcCtx implements NodeProcessorCtx { - private final HiveConf conf; - - // we only convert map joins that follows a root table scan in the same - // mapper. That means there is no reducer between the root table scan and - // mapjoin. - Set listOfRejectedMapjoins = new HashSet(); - - public BucketMapjoinOptProcCtx(HiveConf conf) { - this.conf = conf; - } - - public HiveConf getConf() { - return conf; - } - - public Set getListOfRejectedMapjoins() { - return listOfRejectedMapjoins; - } - } } Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/AbstractBucketJoinProc.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/AbstractBucketJoinProc.java (revision 1431968) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/AbstractBucketJoinProc.java (working copy) @@ -17,34 +17,428 @@ */ package org.apache.hadoop.hive.ql.optimizer; +import java.io.IOException; +import java.net.URI; import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.Iterator; +import java.util.LinkedHashMap; import java.util.List; +import java.util.Map; +import java.util.Map.Entry; import java.util.Stack; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.ErrorMsg; +import org.apache.hadoop.hive.ql.exec.MapJoinOperator; +import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.exec.TableScanOperator; import org.apache.hadoop.hive.ql.lib.Node; import org.apache.hadoop.hive.ql.lib.NodeProcessor; import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.metadata.Partition; +import org.apache.hadoop.hive.ql.metadata.Table; +import org.apache.hadoop.hive.ql.optimizer.ppr.PartitionPruner; +import org.apache.hadoop.hive.ql.parse.ParseContext; +import org.apache.hadoop.hive.ql.parse.PrunedPartitionList; +import org.apache.hadoop.hive.ql.parse.QB; +import org.apache.hadoop.hive.ql.parse.QBJoinTree; import org.apache.hadoop.hive.ql.parse.SemanticException; import 
org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; +import org.apache.hadoop.hive.ql.plan.MapJoinDesc; +import org.apache.hadoop.hive.ql.plan.OperatorDesc; +import org.apache.hadoop.hive.ql.parse.TableAccessAnalyzer; /** * this transformation does bucket map join optimization. */ abstract public class AbstractBucketJoinProc implements NodeProcessor { + private static final Log LOG = + LogFactory.getLog(AbstractBucketJoinProc.class.getName()); - private static final Log LOG = LogFactory.getLog(AbstractBucketJoinProc.class.getName()); + protected ParseContext pGraphContext; + public AbstractBucketJoinProc(ParseContext pGraphContext) { + this.pGraphContext = pGraphContext; + } + public AbstractBucketJoinProc() { } @Override abstract public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, - Object... nodeOutputs) throws SemanticException; + Object... nodeOutputs) throws SemanticException; - public List toColumns(List keys) { + private static List getOnePartitionBucketFileNames( + URI location, ParseContext pGraphContext) throws SemanticException { + List fileNames = new ArrayList(); + try { + FileSystem fs = FileSystem.get(location, pGraphContext.getConf()); + FileStatus[] files = fs.listStatus(new Path(location.toString())); + if (files != null) { + for (FileStatus file : files) { + fileNames.add(file.getPath().toString()); + } + } + } catch (IOException e) { + throw new SemanticException(e); + } + return fileNames; + } + + private boolean checkBucketColumns(List bucketColumns, + List keys, + Integer[] orders) { + if (keys == null || bucketColumns == null || bucketColumns.isEmpty()) { + return false; + } + for (int i = 0; i < keys.size(); i++) { + int index = bucketColumns.indexOf(keys.get(i)); + if (orders[i] != null && orders[i] != index) { + return false; + } + orders[i] = index; + } + + // Check if the join columns contains all bucket columns. + // If a table is bucketized on column B, but the join key is A and B, + // it is easy to see joining on different buckets yield empty results. + return keys.containsAll(bucketColumns); + } + + private boolean checkBucketNumberAgainstBigTable( + Map> aliasToBucketNumber, int bucketNumberInPart) { + for (List bucketNums : aliasToBucketNumber.values()) { + for (int nxt : bucketNums) { + boolean ok = (nxt >= bucketNumberInPart) ? 
nxt % bucketNumberInPart == 0 + : bucketNumberInPart % nxt == 0; + if (!ok) { + return false; + } + } + } + return true; + } + + protected boolean canConvertMapJoinToBucketMapJoin( + MapJoinOperator mapJoinOp, + ParseContext pGraphContext, + BucketJoinOptProcCtx context) throws SemanticException { + + if (context.getSetOfRejectedMapjoins().contains(mapJoinOp)) { + return false; + } + + QBJoinTree joinCtx = this.pGraphContext.getMapJoinContext().get(mapJoinOp); + if (joinCtx == null) { + return false; + } + + List joinAliases = new ArrayList(); + String[] srcs = joinCtx.getBaseSrc(); + String[] left = joinCtx.getLeftAliases(); + List mapAlias = joinCtx.getMapAliases(); + String baseBigAlias = null; + + for (String s : left) { + if (s != null) { + String subQueryAlias = QB.getAppendedAliasFromId(joinCtx.getId(), s); + if (!joinAliases.contains(subQueryAlias)) { + joinAliases.add(subQueryAlias); + if (!mapAlias.contains(s)) { + baseBigAlias = subQueryAlias; + } + } + } + } + + for (String s : srcs) { + if (s != null) { + String subQueryAlias = QB.getAppendedAliasFromId(joinCtx.getId(), s); + if (!joinAliases.contains(subQueryAlias)) { + joinAliases.add(subQueryAlias); + if (!mapAlias.contains(s)) { + baseBigAlias = subQueryAlias; + } + } + } + } + + Map> keysMap = mapJoinOp.getConf().getKeys(); + + return checkConvertBucketMapJoin( + pGraphContext, + context, + joinCtx, + keysMap, + baseBigAlias, + joinAliases); + } + + protected boolean checkConvertBucketMapJoin( + ParseContext pGraphContext, + BucketJoinOptProcCtx context, + QBJoinTree joinCtx, + Map> keysMap, + String baseBigAlias, + List joinAliases) throws SemanticException { + + LinkedHashMap> aliasToPartitionBucketNumberMapping = + new LinkedHashMap>(); + LinkedHashMap>> aliasToPartitionBucketFileNamesMapping = + new LinkedHashMap>>(); + + HashMap> topOps = pGraphContext.getTopOps(); + Map topToTable = pGraphContext.getTopToTable(); + + // (partition to bucket file names) and (partition to bucket number) for + // the big table; + LinkedHashMap> bigTblPartsToBucketFileNames = + new LinkedHashMap>(); + LinkedHashMap bigTblPartsToBucketNumber = + new LinkedHashMap(); + + Integer[] orders = null; // accessing order of join cols to bucket cols, should be same + boolean bigTablePartitioned = true; + for (int index = 0; index < joinAliases.size(); index++) { + String alias = joinAliases.get(index); + Operator topOp = joinCtx.getAliasToOpInfo().get(alias); + if (topOp == null) { + return false; + } + List keys = toColumns(keysMap.get((byte) index)); + if (keys == null || keys.isEmpty()) { + return false; + } + int oldKeySize = keys.size(); + TableScanOperator tso = TableAccessAnalyzer.genRootTableScan(topOp, keys); + if (tso == null) { + return false; + } + + // For nested sub-queries, the alias mapping is not maintained in QB currently. + if (topOps.containsValue(tso)) { + for (Map.Entry> topOpEntry : topOps.entrySet()) { + if (topOpEntry.getValue() == tso) { + String newAlias = topOpEntry.getKey(); + joinAliases.set(index, newAlias); + if (baseBigAlias.equals(alias)) { + baseBigAlias = newAlias; + } + alias = newAlias; + break; + } + } + } + else { + // Ideally, this should never happen, and this should be an assert. + return false; + } + + // The join keys cannot be transformed in the sub-query currently. + // TableAccessAnalyzer.genRootTableScan will only return the base table scan + // if the join keys are constants or a column. Even a simple cast of the join keys + // will result in a null table scan operator. 
In case of constant join keys, they would + // be removed, and the size before and after the genRootTableScan will be different. + if (keys.size() != oldKeySize) { + return false; + } + + if (orders == null) { + orders = new Integer[keys.size()]; + } + + Table tbl = topToTable.get(tso); + if (tbl.isPartitioned()) { + PrunedPartitionList prunedParts; + try { + prunedParts = pGraphContext.getOpToPartList().get(tso); + if (prunedParts == null) { + prunedParts = + PartitionPruner.prune(tbl, pGraphContext.getOpToPartPruner().get(tso), + pGraphContext.getConf(), alias, + pGraphContext.getPrunedPartitions()); + pGraphContext.getOpToPartList().put(tso, prunedParts); + } + } catch (HiveException e) { + // Has to use full name to make sure it does not conflict with + // org.apache.commons.lang.StringUtils + LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e)); + throw new SemanticException(e.getMessage(), e); + } + List partitions = prunedParts.getNotDeniedPartns(); + // construct a mapping of (Partition->bucket file names) and (Partition -> bucket number) + if (partitions.isEmpty()) { + if (!alias.equals(baseBigAlias)) { + aliasToPartitionBucketNumberMapping.put(alias, Arrays.asList()); + aliasToPartitionBucketFileNamesMapping.put(alias, new ArrayList>()); + } + } else { + List buckets = new ArrayList(); + List> files = new ArrayList>(); + for (Partition p : partitions) { + if (!checkBucketColumns(p.getBucketCols(), keys, orders)) { + return false; + } + List fileNames = + getOnePartitionBucketFileNames(p.getDataLocation(), pGraphContext); + // The number of files for the table should be same as number of buckets. + int bucketCount = p.getBucketCount(); + + if (fileNames.size() != bucketCount) { + String msg = "The number of buckets for table " + + tbl.getTableName() + " partition " + p.getName() + " is " + + p.getBucketCount() + ", whereas the number of files is " + fileNames.size(); + throw new SemanticException( + ErrorMsg.BUCKETED_TABLE_METADATA_INCORRECT.getMsg(msg)); + } + + if (alias.equals(baseBigAlias)) { + bigTblPartsToBucketFileNames.put(p, fileNames); + bigTblPartsToBucketNumber.put(p, bucketCount); + } else { + files.add(fileNames); + buckets.add(bucketCount); + } + } + if (!alias.equals(baseBigAlias)) { + aliasToPartitionBucketNumberMapping.put(alias, buckets); + aliasToPartitionBucketFileNamesMapping.put(alias, files); + } + } + } else { + if (!checkBucketColumns(tbl.getBucketCols(), keys, orders)) { + return false; + } + List fileNames = + getOnePartitionBucketFileNames(tbl.getDataLocation(), pGraphContext); + Integer num = new Integer(tbl.getNumBuckets()); + + // The number of files for the table should be same as number of buckets. 
+ if (fileNames.size() != num) { + String msg = "The number of buckets for table " + + tbl.getTableName() + " is " + tbl.getNumBuckets() + + ", whereas the number of files is " + fileNames.size(); + throw new SemanticException( + ErrorMsg.BUCKETED_TABLE_METADATA_INCORRECT.getMsg(msg)); + } + + if (alias.equals(baseBigAlias)) { + bigTblPartsToBucketFileNames.put(null, fileNames); + bigTblPartsToBucketNumber.put(null, tbl.getNumBuckets()); + bigTablePartitioned = false; + } else { + aliasToPartitionBucketNumberMapping.put(alias, Arrays.asList(num)); + aliasToPartitionBucketFileNamesMapping.put(alias, Arrays.asList(fileNames)); + } + } + } + + // All tables or partitions are bucketed, and their bucket number is + // stored in 'bucketNumbers', we need to check if the number of buckets in + // the big table can be divided by no of buckets in small tables. + for (Integer bucketNumber : bigTblPartsToBucketNumber.values()) { + if (!checkBucketNumberAgainstBigTable(aliasToPartitionBucketNumberMapping, bucketNumber)) { + return false; + } + } + + context.setAliasToPartitionBucketNumberMapping(aliasToPartitionBucketNumberMapping); + context.setAliasToPartitionBucketFileNamesMapping(aliasToPartitionBucketFileNamesMapping); + context.setBigTblPartsToBucketFileNames(bigTblPartsToBucketFileNames); + context.setBigTblPartsToBucketNumber(bigTblPartsToBucketNumber); + context.setJoinAliases(joinAliases); + context.setBaseBigAlias(baseBigAlias); + context.setBigTablePartitioned(bigTablePartitioned); + + return true; + } + + protected void convertMapJoinToBucketMapJoin( + MapJoinOperator mapJoinOp, + BucketJoinOptProcCtx context) throws SemanticException { + MapJoinDesc desc = mapJoinOp.getConf(); + + Map>> aliasBucketFileNameMapping = + new LinkedHashMap>>(); + + Map> aliasToPartitionBucketNumberMapping = + context.getAliasToPartitionBucketNumberMapping(); + + Map>> aliasToPartitionBucketFileNamesMapping = + context.getAliasToPartitionBucketFileNamesMapping(); + + Map> bigTblPartsToBucketFileNames = + context.getBigTblPartsToBucketFileNames(); + + Map bigTblPartsToBucketNumber = + context.getBigTblPartsToBucketNumber(); + + List joinAliases = context.getJoinAliases(); + String baseBigAlias = context.getBaseBigAlias(); + + // sort bucket names for the big table + for (List partBucketNames : bigTblPartsToBucketFileNames.values()) { + Collections.sort(partBucketNames); + } + + // go through all small tables and get the mapping from bucket file name + // in the big table to bucket file names in small tables. + for (int j = 0; j < joinAliases.size(); j++) { + String alias = joinAliases.get(j); + if (alias.equals(baseBigAlias)) { + continue; + } + for (List names : aliasToPartitionBucketFileNamesMapping.get(alias)) { + Collections.sort(names); + } + List smallTblBucketNums = aliasToPartitionBucketNumberMapping.get(alias); + List> smallTblFilesList = aliasToPartitionBucketFileNamesMapping.get(alias); + + Map> mapping = new LinkedHashMap>(); + aliasBucketFileNameMapping.put(alias, mapping); + + // for each bucket file in big table, get the corresponding bucket file + // name in the small table. 
+ // more than 1 partition in the big table, do the mapping for each partition + Iterator>> bigTblPartToBucketNames = + bigTblPartsToBucketFileNames.entrySet().iterator(); + Iterator> bigTblPartToBucketNum = bigTblPartsToBucketNumber + .entrySet().iterator(); + while (bigTblPartToBucketNames.hasNext()) { + assert bigTblPartToBucketNum.hasNext(); + int bigTblBucketNum = bigTblPartToBucketNum.next().getValue(); + List bigTblBucketNameList = bigTblPartToBucketNames.next().getValue(); + fillMapping(smallTblBucketNums, smallTblFilesList, + mapping, bigTblBucketNum, bigTblBucketNameList, desc.getBigTableBucketNumMapping()); + } + } + desc.setAliasBucketFileNameMapping(aliasBucketFileNameMapping); + desc.setBigTableAlias(baseBigAlias); + boolean bigTablePartitioned = context.isBigTablePartitioned(); + if (bigTablePartitioned) { + desc.setBigTablePartSpecToFileMapping(convert(bigTblPartsToBucketFileNames)); + } + } + + // convert partition to partition spec string + private static Map> convert(Map> mapping) { + Map> converted = new HashMap>(); + for (Map.Entry> entry : mapping.entrySet()) { + converted.put(entry.getKey().getName(), entry.getValue()); + } + return converted; + } + + private List toColumns(List keys) { List columns = new ArrayList(); for (ExprNodeDesc key : keys) { if (!(key instanceof ExprNodeColumnDesc)) { @@ -54,4 +448,37 @@ } return columns; } + + // called for each partition of big table and populates mapping for each file in the partition + private static void fillMapping( + List smallTblBucketNums, + List> smallTblFilesList, + Map> mapping, + int bigTblBucketNum, List bigTblBucketNameList, + Map bucketFileNameMapping) { + + for (int bindex = 0; bindex < bigTblBucketNameList.size(); bindex++) { + ArrayList resultFileNames = new ArrayList(); + for (int sindex = 0 ; sindex < smallTblBucketNums.size(); sindex++) { + int smallTblBucketNum = smallTblBucketNums.get(sindex); + List smallTblFileNames = smallTblFilesList.get(sindex); + if (bigTblBucketNum >= smallTblBucketNum) { + // if the big table has more buckets than the current small table, + // use "MOD" to get small table bucket names. For example, if the big + // table has 4 buckets and the small table has 2 buckets, then the + // mapping should be 0->0, 1->1, 2->0, 3->1. + int toAddSmallIndex = bindex % smallTblBucketNum; + resultFileNames.add(smallTblFileNames.get(toAddSmallIndex)); + } else { + int jump = smallTblBucketNum / bigTblBucketNum; + for (int i = bindex; i < smallTblFileNames.size(); i = i + jump) { + resultFileNames.add(smallTblFileNames.get(i)); + } + } + } + String inputBigTBLBucket = bigTblBucketNameList.get(bindex); + mapping.put(inputBigTBLBucket, resultFileNames); + bucketFileNameMapping.put(inputBigTBLBucket, bindex); + } + } } Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/SizeSortMergeJoinBigTableMatcher.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/SizeSortMergeJoinBigTableMatcher.java (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/SizeSortMergeJoinBigTableMatcher.java (working copy) @@ -0,0 +1,81 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.optimizer; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.exec.JoinOperator; +import org.apache.hadoop.hive.ql.exec.TableScanOperator; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.metadata.Partition; +import org.apache.hadoop.hive.ql.metadata.Table; +import org.apache.hadoop.hive.ql.optimizer.ppr.PartitionPruner; +import org.apache.hadoop.hive.ql.parse.ParseContext; +import org.apache.hadoop.hive.ql.parse.PrunedPartitionList; +import org.apache.hadoop.hive.ql.parse.SemanticException; + +/* + * This is a pluggable policy to choose the candidate map-join table for converting a join to a + * sort merge join. The largest table is chosen based on the size of the tables. + */ +public class SizeSortMergeJoinBigTableMatcher extends CommonSizeSortMergeJoinBigTableMatcher +implements SortMergeJoinBigTableMatcher { + public int getBigTablePosition(ParseContext parseCtx, JoinOperator joinOp) + throws SemanticException { + int bigTablePos = 0; + long maxSize = 0; + HiveConf conf = parseCtx.getConf(); + + try { + List topOps = new ArrayList(); + getListTopOps(joinOp, topOps); + int currentPos = 0; + for (TableScanOperator topOp : topOps) { + Table table = parseCtx.getTopToTable().get(topOp); + long currentSize = 0; + + if (!table.isPartitioned()) { + currentSize = getSize(conf, table); + } + else { + // For partitioned tables, get the size of all the partitions + PrunedPartitionList partsList = + PartitionPruner.prune(parseCtx.getTopToTable().get(topOp), + parseCtx.getOpToPartPruner().get(topOp), parseCtx.getConf(), + null, parseCtx.getPrunedPartitions()); + for (Partition part : partsList.getNotDeniedPartns()) { + currentSize += getSize(conf, part); + } + } + + if (currentSize > maxSize) { + maxSize = currentSize; + bigTablePos = currentPos; + } + currentPos++; + } + } catch (HiveException e) { + throw new SemanticException(e.getMessage()); + } + + return bigTablePos; + } +} Index: ql/src/java/org/apache/hadoop/hive/ql/exec/Task.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/Task.java (revision 1431968) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/Task.java (working copy) @@ -76,6 +76,7 @@ public static final int CONVERTED_LOCAL_MAPJOIN = 3; public static final int BACKUP_COMMON_JOIN = 4; public static final int LOCAL_MAPJOIN=5; + public static final int CONVERTED_SORTMERGEJOIN = 6; // Descendants tasks who subscribe feeds from this task Index: ql/src/java/org/apache/hadoop/hive/ql/exec/FilterOperator.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/FilterOperator.java (revision 1431968) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/FilterOperator.java (working copy) @@ -172,6 +172,11 @@ } @Override + public boolean supportAutomaticSortMergeJoin() { + return true; + } + + @Override public boolean supportUnionRemoveOptimization() { return true; } Index:
ql/src/java/org/apache/hadoop/hive/ql/exec/SMBMapJoinOperator.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/SMBMapJoinOperator.java (revision 1431968) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/SMBMapJoinOperator.java (working copy) @@ -76,6 +76,11 @@ private transient boolean inputFileChanged = false; transient boolean localWorkInited = false; + // This join has been converted to a SMB join by the hive optimizer. The user did not + // give a mapjoin hint in the query. The hive optimizer figured out that the join can be + // performed as a smb join, based on all the tables/partitions being joined. + private transient boolean convertedAutomaticallySMBJoin = false; + public SMBMapJoinOperator() { } @@ -622,6 +627,14 @@ return OperatorType.MAPJOIN; } + public boolean isConvertedAutomaticallySMBJoin() { + return convertedAutomaticallySMBJoin; + } + + public void setConvertedAutomaticallySMBJoin(boolean convertedAutomaticallySMBJoin) { + this.convertedAutomaticallySMBJoin = convertedAutomaticallySMBJoin; + } + // returns rows from possibly multiple bucket files of small table in ascending order // by utilizing primary queue (borrowed from hadoop) // elements of queue (Integer) are index to FetchOperator[] (segments) Index: ql/src/java/org/apache/hadoop/hive/ql/exec/SelectOperator.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/SelectOperator.java (revision 1431968) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/SelectOperator.java (working copy) @@ -112,6 +112,11 @@ } @Override + public boolean supportAutomaticSortMergeJoin() { + return true; + } + + @Override public boolean supportUnionRemoveOptimization() { return true; } Index: ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java (revision 1431968) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java (working copy) @@ -295,4 +295,9 @@ public boolean supportSkewJoinOptimization() { return true; } + + @Override + public boolean supportAutomaticSortMergeJoin() { + return true; + } } Index: ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java (revision 1431968) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java (working copy) @@ -1426,6 +1426,15 @@ this.useBucketizedHiveInputFormat = useBucketizedHiveInputFormat; } + /** + * Whether this operator supports automatic sort merge join. + * The stack is traversed, and this method is invoked for all the operators. + * @return TRUE if yes, FALSE otherwise. 
+ */ + public boolean supportAutomaticSortMergeJoin() { + return false; + } + public boolean supportUnionRemoveOptimization() { return false; } Index: ql/src/java/org/apache/hadoop/hive/ql/plan/SMBJoinDesc.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/plan/SMBJoinDesc.java (revision 1431968) +++ ql/src/java/org/apache/hadoop/hive/ql/plan/SMBJoinDesc.java (working copy) @@ -31,7 +31,7 @@ private MapredLocalWork localWork; - //keep a mapping from tag to the fetch operator alias + // keep a mapping from tag to the fetch operator alias private HashMap tagToAlias; private Map aliasToSink; Index: ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java (revision 1431968) +++ ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java (working copy) @@ -71,6 +71,7 @@ import org.apache.hadoop.hive.ql.exec.RecordWriter; import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator; import org.apache.hadoop.hive.ql.exec.RowSchema; +import org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator; import org.apache.hadoop.hive.ql.exec.SelectOperator; import org.apache.hadoop.hive.ql.exec.StatsTask; import org.apache.hadoop.hive.ql.exec.TableScanOperator; @@ -195,6 +196,7 @@ private List loadTableWork; private List loadFileWork; private Map joinContext; + private Map smbMapJoinContext; private final HashMap topToTable; private QB qb; private ASTNode ast; @@ -242,6 +244,7 @@ loadFileWork = new ArrayList(); opParseCtx = new LinkedHashMap, OpParseContext>(); joinContext = new HashMap(); + smbMapJoinContext = new HashMap(); topToTable = new HashMap(); destTableId = 1; uCtx = null; @@ -270,6 +273,7 @@ ast = null; uCtx = null; joinContext.clear(); + smbMapJoinContext.clear(); opParseCtx.clear(); groupOpToInputTables.clear(); prunedPartitions.clear(); @@ -285,6 +289,7 @@ loadTableWork = pctx.getLoadTableWork(); loadFileWork = pctx.getLoadFileWork(); joinContext = pctx.getJoinContext(); + smbMapJoinContext = pctx.getSmbMapJoinContext(); ctx = pctx.getContext(); destTableId = pctx.getDestTableId(); idToTableNameMap = pctx.getIdToTableNameMap(); @@ -299,7 +304,7 @@ public ParseContext getParseContext() { return new ParseContext(conf, qb, ast, opToPartPruner, opToPartList, topOps, - topSelOps, opParseCtx, joinContext, topToTable, loadTableWork, + topSelOps, opParseCtx, joinContext, smbMapJoinContext, topToTable, loadTableWork, loadFileWork, ctx, idToTableNameMap, destTableId, uCtx, listMapJoinOpsNoReducer, groupOpToInputTables, prunedPartitions, opToSamplePruner, globalLimitCtx, nameToSplitSample, inputs, rootTasks, @@ -8449,7 +8454,8 @@ } ParseContext pCtx = new ParseContext(conf, qb, child, opToPartPruner, - opToPartList, topOps, topSelOps, opParseCtx, joinContext, topToTable, + opToPartList, topOps, topSelOps, opParseCtx, joinContext, smbMapJoinContext, + topToTable, loadTableWork, loadFileWork, ctx, idToTableNameMap, destTableId, uCtx, listMapJoinOpsNoReducer, groupOpToInputTables, prunedPartitions, opToSamplePruner, globalLimitCtx, nameToSplitSample, inputs, rootTasks, Index: ql/src/java/org/apache/hadoop/hive/ql/parse/ParseContext.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/parse/ParseContext.java (revision 1431968) +++ ql/src/java/org/apache/hadoop/hive/ql/parse/ParseContext.java (working copy) @@ -34,6 +34,7 @@ import 
org.apache.hadoop.hive.ql.exec.JoinOperator; import org.apache.hadoop.hive.ql.exec.MapJoinOperator; import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator; import org.apache.hadoop.hive.ql.exec.TableScanOperator; import org.apache.hadoop.hive.ql.exec.Task; import org.apache.hadoop.hive.ql.hooks.LineageInfo; @@ -71,6 +72,7 @@ private LinkedHashMap, OpParseContext> opParseCtx; private Map joinContext; private Map mapJoinContext; + private Map smbMapJoinContext; private HashMap topToTable; private HashMap nameToSplitSample; private List loadTableWork; @@ -159,6 +161,7 @@ HashMap> topSelOps, LinkedHashMap, OpParseContext> opParseCtx, Map joinContext, + Map smbMapJoinContext, HashMap topToTable, List loadTableWork, List loadFileWork, Context ctx, HashMap idToTableNameMap, int destTableId, @@ -176,6 +179,7 @@ this.opToPartPruner = opToPartPruner; this.opToPartList = opToPartList; this.joinContext = joinContext; + this.smbMapJoinContext = smbMapJoinContext; this.topToTable = topToTable; this.loadFileWork = loadFileWork; this.loadTableWork = loadTableWork; @@ -525,6 +529,14 @@ this.mapJoinContext = mapJoinContext; } + public Map getSmbMapJoinContext() { + return smbMapJoinContext; + } + + public void setSmbMapJoinContext(Map smbMapJoinContext) { + this.smbMapJoinContext = smbMapJoinContext; + } + public GlobalLimitCtx getGlobalLimitCtx() { return globalLimitCtx; }
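
The SortMergeJoinBigTableMatcher interface added in this patch is the extension point for the big-table selection policy, with LeftSortMergeJoinBigTableMatcher and SizeSortMergeJoinBigTableMatcher as the two policies it ships. As a rough illustration of how a third policy could be written against that interface, the sketch below uses a hypothetical rightmost-table heuristic; the class name RightSortMergeJoinBigTableMatcher and the heuristic are illustrative assumptions rather than part of the patch, and such a class would presumably be selected through the hive.auto.sort.merge.join.bigtbl.matcher property.

package org.apache.hadoop.hive.ql.optimizer;

import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.hive.ql.exec.JoinOperator;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.parse.ParseContext;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;

/*
 * Illustrative sketch only: a custom policy that picks the rightmost table scan feeding the
 * join as the big table, mirroring the parent-operator traversal used by
 * CommonSizeSortMergeJoinBigTableMatcher.getListTopOps().
 */
public class RightSortMergeJoinBigTableMatcher implements SortMergeJoinBigTableMatcher {

  // Walk up the operator tree and collect the table scans that feed this operator.
  private void getTopOps(Operator<? extends OperatorDesc> op, List<TableScanOperator> topOps) {
    if (op.getParentOperators() == null || op.getParentOperators().isEmpty()) {
      return;
    }
    for (Operator<? extends OperatorDesc> parentOp : op.getParentOperators()) {
      if (parentOp instanceof TableScanOperator) {
        topOps.add((TableScanOperator) parentOp);
      } else {
        getTopOps(parentOp, topOps);
      }
    }
  }

  public int getBigTablePosition(ParseContext parseContext, JoinOperator joinOp)
      throws SemanticException {
    List<TableScanOperator> topOps = new ArrayList<TableScanOperator>();
    getTopOps(joinOp, topOps);
    // LeftSortMergeJoinBigTableMatcher always returns position 0; this variant returns the
    // last position instead, falling back to 0 if no table scan is found.
    return topOps.isEmpty() ? 0 : topOps.size() - 1;
  }
}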