Index: conf/hive-default.xml.template
===================================================================
--- conf/hive-default.xml.template (revision 1406918)
+++ conf/hive-default.xml.template (working copy)
@@ -878,6 +878,24 @@
+<property>
+  <name>hive.auto.sort.merge.join</name>
+  <value>false</value>
+  <description>Whether the join will be automatically converted to a sort-merge join, if the joined tables pass
+    the criteria for a sort-merge join.
+  </description>
+</property>
+
+<property>
+  <name>hive.auto.sort.merge.join.bigtbl.matcher</name>
+  <value>org.apache.hadoop.hive.ql.optimizer.LeftSortMergeJoinBigTableMatcher</value>
+  <description>The policy used to choose the big table for automatic conversion to a sort-merge join.
+    By default, the leftmost table is chosen as the big table. Other policies are based on size:
+    org.apache.hadoop.hive.ql.optimizer.SizeSortMergeJoinBigTableMatcher.
+    New policies can be added in the future.
+  </description>
+</property>
+
 <property>
   <name>hive.metastore.ds.connection.url.hook</name>
   <description>Name of the hook to use for retriving the JDO connection URL. If empty, the value in javax.jdo.option.ConnectionURL is used</description>
Index: metastore/src/java/org/apache/hadoop/hive/metastore/HiveMetaStore.java
===================================================================
--- metastore/src/java/org/apache/hadoop/hive/metastore/HiveMetaStore.java (revision 1406918)
+++ metastore/src/java/org/apache/hadoop/hive/metastore/HiveMetaStore.java (working copy)
@@ -469,6 +469,10 @@
       return startFunction(function, " : db=" + db + " tbl=" + tbl + "partition=" + partName);
     }
 
+    public void endFunction(String function, boolean successful) {
+      endFunction(function, successful, null);
+    }
+
     public void endFunction(String function, boolean successful, Exception e) {
       endFunction(function, new MetaStoreEndFunctionContext(successful, e));
     }
Index: data/files/smallsrcsortbucket4outof4.txt
===================================================================
--- data/files/smallsrcsortbucket4outof4.txt (revision 0)
+++ data/files/smallsrcsortbucket4outof4.txt (working copy)
@@ -0,0 +1,5 @@
+146val_146
+193val_193
+432val_432
+65val_65
+83val_83
Index: data/files/smallsrcsortbucket1outof4.txt
===================================================================
--- data/files/smallsrcsortbucket1outof4.txt (revision 0)
+++ data/files/smallsrcsortbucket1outof4.txt (working copy)
@@ -0,0 +1,5 @@
+0val_0
+103val_103
+169val_169
+172val_172
+374val_374
Index: data/files/smallsrcsortbucket2outof4.txt
===================================================================
--- data/files/smallsrcsortbucket2outof4.txt (revision 0)
+++ data/files/smallsrcsortbucket2outof4.txt (working copy)
@@ -0,0 +1,5 @@
+180val_180
+221val_221
+379val_379
+478val_478
+74val_74
Index: data/files/smallsrcsortbucket3outof4.txt
===================================================================
--- data/files/smallsrcsortbucket3outof4.txt (revision 0)
+++ data/files/smallsrcsortbucket3outof4.txt (working copy)
@@ -0,0 +1,5 @@
+233val_233
+424val_424
+468val_468
+53val_53
+97val_97
Index: common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
===================================================================
--- common/src/java/org/apache/hadoop/hive/conf/HiveConf.java (revision 1406918)
+++ common/src/java/org/apache/hadoop/hive/conf/HiveConf.java (working copy)
@@ -491,6 +491,10 @@
     HIVEENFORCESORTMERGEBUCKETMAPJOIN("hive.enforce.sortmergebucketmapjoin", false),
     HIVEENFORCEBUCKETMAPJOIN("hive.enforce.bucketmapjoin", false),
 
+    HIVE_AUTO_SORT_MERGE_JOIN("hive.auto.sort.merge.join", false),
+    HIVE_AUTO_SORT_MERGE_JOIN_BIGTABLE_MATCHER("hive.auto.sort.merge.join.bigtbl.matcher", +
"org.apache.hadoop.hive.ql.optimizer.LeftSortMergeJoinBigTableMatcher"), + HIVESCRIPTOPERATORTRUST("hive.exec.script.trust", false), HIVEROWOFFSET("hive.exec.rowoffset", false), Index: ql/src/test/results/clientpositive/auto_sortmerge_join_6.q.out =================================================================== --- ql/src/test/results/clientpositive/auto_sortmerge_join_6.q.out (revision 0) +++ ql/src/test/results/clientpositive/auto_sortmerge_join_6.q.out (working copy) @@ -0,0 +1,730 @@ +PREHOOK: query: -- small no part, 4 bucket & big 2 part, 2 bucket +CREATE TABLE bucket_small (key string, value string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- small no part, 4 bucket & big 2 part, 2 bucket +CREATE TABLE bucket_small (key string, value string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@bucket_small +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small +PREHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@bucket_big +PREHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big +POSTHOOK: Output: default@bucket_big@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +POSTHOOK: type: 
LOAD +POSTHOOK: Output: default@bucket_big@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big +POSTHOOK: Output: default@bucket_big@ds=2008-04-09 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big@ds=2008-04-09 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big@ds=2008-04-09 +PREHOOK: query: -- Since size is being used to find the big table, the order of the tables in the join does not matter +explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: -- Since size is being used to find the big table, the order of the tables in the join does not matter +explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME bucket_small) a) (TOK_TABREF (TOK_TABNAME bucket_big) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count))))) + +STAGE DEPENDENCIES: + Stage-6 is a root stage , consists of Stage-7, Stage-8, Stage-1 + Stage-7 has a backup stage: Stage-1 + Stage-4 depends on stages: Stage-7 + Stage-2 depends on stages: Stage-1, Stage-4, Stage-5 + Stage-8 has a backup stage: Stage-1 + Stage-5 depends on stages: Stage-8 + Stage-1 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-6 + Conditional Operator + + Stage: Stage-7 + Map Reduce Local Work + Alias -> Map Local Tables: + b + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + b + TableScan + alias: b + GatherStats: false + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + GatherStats: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Local Work: + Map Reduce Local Work + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + 
columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5500 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-09 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-09 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5500 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 0 + numRows 0 + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 226 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 0 + numRows 0 + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 226 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small + Truncated Path -> Alias: + /bucket_small [a] + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: -mr-10002 + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0 + columns.types bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Truncated Path -> Alias: +#### A masked pattern was here #### + + Stage: Stage-8 + Map Reduce Local Work + Alias -> Map Local Tables: + a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + a + TableScan + alias: a + GatherStats: false + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + + Stage: Stage-5 + Map Reduce + Alias -> Map Operator Tree: + b + TableScan + alias: b + GatherStats: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Local Work: + Map Reduce Local Work + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern 
was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5500 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-09 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-09 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5500 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 0 + numRows 0 + rawDataSize 0 + serialization.ddl struct bucket_small { 
string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 226 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 0 + numRows 0 + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 226 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small + Truncated Path -> Alias: + /bucket_big/ds=2008-04-08 [b] + /bucket_big/ds=2008-04-09 [b] + + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + b + TableScan + alias: b + GatherStats: false + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5500 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-09 + input format: org.apache.hadoop.mapred.TextInputFormat + output 
format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-09 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5500 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big + Truncated Path -> Alias: + /bucket_big/ds=2008-04-08 [b] + /bucket_big/ds=2008-04-09 [b] + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@bucket_big@ds=2008-04-08 +PREHOOK: Input: default@bucket_big@ds=2008-04-09 +PREHOOK: Input: default@bucket_small +#### A masked pattern was here #### +POSTHOOK: query: select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@bucket_big@ds=2008-04-08 +POSTHOOK: Input: default@bucket_big@ds=2008-04-09 +POSTHOOK: Input: default@bucket_small +#### A masked pattern was here #### +38 Index: ql/src/test/results/clientpositive/auto_sortmerge_join_1.q.out =================================================================== --- ql/src/test/results/clientpositive/auto_sortmerge_join_1.q.out (revision 0) +++ ql/src/test/results/clientpositive/auto_sortmerge_join_1.q.out (working copy) @@ -0,0 +1,753 @@ +PREHOOK: query: -- small 1 part, 2 bucket & big 2 part, 4 bucket + +CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- small 1 part, 2 bucket & big 2 part, 4 bucket + +CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@bucket_small +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small +POSTHOOK: Output: default@bucket_small@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small 
partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small@ds=2008-04-08 +PREHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@bucket_big +PREHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big +POSTHOOK: Output: default@bucket_big@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big +POSTHOOK: Output: default@bucket_big@ds=2008-04-09 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big@ds=2008-04-09 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big@ds=2008-04-09 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big@ds=2008-04-09 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big 
partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big@ds=2008-04-09 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big@ds=2008-04-09 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big@ds=2008-04-09 +PREHOOK: query: -- Since size is being used to find the big table, the order of the tables in the join does not matter +explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: -- Since size is being used to find the big table, the order of the tables in the join does not matter +explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME bucket_small) a) (TOK_TABREF (TOK_TABNAME bucket_big) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count))))) + +STAGE DEPENDENCIES: + Stage-6 is a root stage , consists of Stage-7, Stage-8, Stage-1 + Stage-7 has a backup stage: Stage-1 + Stage-4 depends on stages: Stage-7 + Stage-2 depends on stages: Stage-1, Stage-4, Stage-5 + Stage-8 has a backup stage: Stage-1 + Stage-5 depends on stages: Stage-8 + Stage-1 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-6 + Conditional Operator + + Stage: Stage-7 + Map Reduce Local Work + Alias -> Map Local Tables: + b + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + b + TableScan + alias: b + GatherStats: false + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + GatherStats: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Local Work: + Map Reduce Local Work + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked 
pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 11624 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-09 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-09 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 11624 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 2 + numPartitions 1 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 114 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 2 + numPartitions 1 + numRows 0 + partition_columns ds + 
rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 114 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small + Truncated Path -> Alias: + /bucket_small/ds=2008-04-08 [a] + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: -mr-10002 + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0 + columns.types bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Truncated Path -> Alias: +#### A masked pattern was here #### + + Stage: Stage-8 + Map Reduce Local Work + Alias -> Map Local Tables: + a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + a + TableScan + alias: a + GatherStats: false + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + + Stage: Stage-5 + Map Reduce + Alias -> Map Operator Tree: + b + TableScan + alias: b + GatherStats: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Local Work: + Map Reduce Local Work + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + 
bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 11624 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-09 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-09 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 11624 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 2 + numPartitions 1 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 114 +#### A masked pattern was here #### + serde: 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 2 + numPartitions 1 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 114 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small + Truncated Path -> Alias: + /bucket_big/ds=2008-04-08 [b] + /bucket_big/ds=2008-04-09 [b] + + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + b + TableScan + alias: b + GatherStats: false + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 11624 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-09 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-09 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + 
bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 11624 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big + Truncated Path -> Alias: + /bucket_big/ds=2008-04-08 [b] + /bucket_big/ds=2008-04-09 [b] + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@bucket_big@ds=2008-04-08 +PREHOOK: Input: default@bucket_big@ds=2008-04-09 +PREHOOK: Input: default@bucket_small@ds=2008-04-08 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@bucket_big@ds=2008-04-08 +POSTHOOK: Input: default@bucket_big@ds=2008-04-09 +POSTHOOK: Input: default@bucket_small@ds=2008-04-08 +#### A masked pattern was here #### +38 Index: ql/src/test/results/clientpositive/auto_sortmerge_join_5.q.out =================================================================== --- ql/src/test/results/clientpositive/auto_sortmerge_join_5.q.out (revision 0) +++ ql/src/test/results/clientpositive/auto_sortmerge_join_5.q.out (working copy) @@ -0,0 +1,550 @@ +PREHOOK: query: -- small no part, 4 bucket & big no part, 2 bucket +CREATE TABLE bucket_small (key string, value string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- small no part, 4 bucket & big no part, 2 bucket +CREATE TABLE bucket_small (key string, value string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@bucket_small +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small +PREHOOK: query: load data local inpath 
'../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small +PREHOOK: query: CREATE TABLE bucket_big (key string, value string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE bucket_big (key string, value string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@bucket_big +PREHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big +PREHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big +PREHOOK: query: -- Since size is being used to find the big table, the order of the tables in the join does not matter +explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: -- Since size is being used to find the big table, the order of the tables in the join does not matter +explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME bucket_small) a) (TOK_TABREF (TOK_TABNAME bucket_big) b) (= (. (TOK_TABLE_OR_COL a) key) (. 
(TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count))))) + +STAGE DEPENDENCIES: + Stage-6 is a root stage , consists of Stage-7, Stage-8, Stage-1 + Stage-7 has a backup stage: Stage-1 + Stage-4 depends on stages: Stage-7 + Stage-2 depends on stages: Stage-1, Stage-4, Stage-5 + Stage-8 has a backup stage: Stage-1 + Stage-5 depends on stages: Stage-8 + Stage-1 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-6 + Conditional Operator + + Stage: Stage-7 + Map Reduce Local Work + Alias -> Map Local Tables: + b + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + b + TableScan + alias: b + GatherStats: false + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + GatherStats: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Local Work: + Map Reduce Local Work + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: bucket_big + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 0 + numRows 0 + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 0 + numRows 0 + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + 
numPartitions 0 + numRows 0 + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 226 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 0 + numRows 0 + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 226 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small + Truncated Path -> Alias: + /bucket_small [a] + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: -mr-10002 + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0 + columns.types bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Truncated Path -> Alias: +#### A masked pattern was here #### + + Stage: Stage-8 + Map Reduce Local Work + Alias -> Map Local Tables: + a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + a + TableScan + alias: a + GatherStats: false + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + + Stage: Stage-5 + Map Reduce + Alias -> Map Operator Tree: + b + TableScan + alias: b + GatherStats: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: 
org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Local Work: + Map Reduce Local Work + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: bucket_big + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 0 + numRows 0 + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 0 + numRows 0 + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 0 + numRows 0 + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 226 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 0 + numRows 0 + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 226 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small + Truncated Path -> Alias: + /bucket_big [b] + + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + b + TableScan + alias: b + GatherStats: false + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + 
handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: bucket_big + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 0 + numRows 0 + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 0 + numRows 0 + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big + Truncated Path -> Alias: + /bucket_big [b] + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@bucket_big +PREHOOK: Input: default@bucket_small +#### A masked pattern was here #### +POSTHOOK: query: select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@bucket_big +POSTHOOK: Input: default@bucket_small +#### A masked pattern was here #### +19 Index: ql/src/test/results/clientpositive/auto_sortmerge_join_9.q.out =================================================================== --- ql/src/test/results/clientpositive/auto_sortmerge_join_9.q.out (revision 0) +++ ql/src/test/results/clientpositive/auto_sortmerge_join_9.q.out (working copy) @@ -0,0 +1,727 @@ +PREHOOK: query: -- small 2 part, 2 bucket & big 2 part, 4 bucket +CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- small 2 part, 2 bucket & big 2 part, 4 bucket +CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@bucket_small +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small 
partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small +POSTHOOK: Output: default@bucket_small@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small +POSTHOOK: Output: default@bucket_small@ds=2008-04-09 +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small@ds=2008-04-09 +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small@ds=2008-04-09 +PREHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@bucket_big +PREHOOK: query: INSERT OVERWRITE TABLE bucket_big PARTITION (ds = '2008-04-08') +SELECT * FROM +( + select * from src + union all + select * from src + union all + select * from src + union all + select * from src + union all + select * from src + union all + select * from src + union all + select * from src + union all + select * from src + union all + select * from src + union all + select * from src + union all + select * from src + union all + select * from src +) subq +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@bucket_big@ds=2008-04-08 +POSTHOOK: query: INSERT OVERWRITE TABLE bucket_big PARTITION (ds = '2008-04-08') +SELECT * FROM +( + select * from src + union all + select * from src + union all + select * from src + union all + select * from src + union all + select * from src + union all + select * from src + union all + select * from src + union all + select * from src + union all + select * from src + union all + select * from src + union all + select * from src + union all + select * from src +) subq +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@bucket_big@ds=2008-04-08 +POSTHOOK: Lineage: bucket_big PARTITION(ds=2008-04-08).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), (src)src.FieldSchema(name:key, type:string, comment:default), (src)src.FieldSchema(name:key, type:string, comment:default), (src)src.FieldSchema(name:key, type:string, comment:default), 
(src)src.FieldSchema(name:key, type:string, comment:default), (src)src.FieldSchema(name:key, type:string, comment:default), (src)src.FieldSchema(name:key, type:string, comment:default), (src)src.FieldSchema(name:key, type:string, comment:default), (src)src.FieldSchema(name:key, type:string, comment:default), (src)src.FieldSchema(name:key, type:string, comment:default), (src)src.FieldSchema(name:key, type:string, comment:default), (src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: bucket_big PARTITION(ds=2008-04-08).value EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), (src)src.FieldSchema(name:value, type:string, comment:default), (src)src.FieldSchema(name:value, type:string, comment:default), (src)src.FieldSchema(name:value, type:string, comment:default), (src)src.FieldSchema(name:value, type:string, comment:default), (src)src.FieldSchema(name:value, type:string, comment:default), (src)src.FieldSchema(name:value, type:string, comment:default), (src)src.FieldSchema(name:value, type:string, comment:default), (src)src.FieldSchema(name:value, type:string, comment:default), (src)src.FieldSchema(name:value, type:string, comment:default), (src)src.FieldSchema(name:value, type:string, comment:default), (src)src.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: -- Since size is being used to find the big table, the order of the tables in the join does not matter +explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: -- Since size is being used to find the big table, the order of the tables in the join does not matter +explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: bucket_big PARTITION(ds=2008-04-08).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), (src)src.FieldSchema(name:key, type:string, comment:default), (src)src.FieldSchema(name:key, type:string, comment:default), (src)src.FieldSchema(name:key, type:string, comment:default), (src)src.FieldSchema(name:key, type:string, comment:default), (src)src.FieldSchema(name:key, type:string, comment:default), (src)src.FieldSchema(name:key, type:string, comment:default), (src)src.FieldSchema(name:key, type:string, comment:default), (src)src.FieldSchema(name:key, type:string, comment:default), (src)src.FieldSchema(name:key, type:string, comment:default), (src)src.FieldSchema(name:key, type:string, comment:default), (src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: bucket_big PARTITION(ds=2008-04-08).value EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), (src)src.FieldSchema(name:value, type:string, comment:default), (src)src.FieldSchema(name:value, type:string, comment:default), (src)src.FieldSchema(name:value, type:string, comment:default), (src)src.FieldSchema(name:value, type:string, comment:default), (src)src.FieldSchema(name:value, type:string, comment:default), (src)src.FieldSchema(name:value, type:string, comment:default), (src)src.FieldSchema(name:value, type:string, comment:default), (src)src.FieldSchema(name:value, type:string, comment:default), (src)src.FieldSchema(name:value, type:string, comment:default), (src)src.FieldSchema(name:value, type:string, comment:default), (src)src.FieldSchema(name:value, type:string, comment:default), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME 
bucket_small) a) (TOK_TABREF (TOK_TABNAME bucket_big) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count))))) + +STAGE DEPENDENCIES: + Stage-6 is a root stage , consists of Stage-7, Stage-8, Stage-1 + Stage-7 has a backup stage: Stage-1 + Stage-4 depends on stages: Stage-7 + Stage-2 depends on stages: Stage-1, Stage-4, Stage-5 + Stage-8 has a backup stage: Stage-1 + Stage-5 depends on stages: Stage-8 + Stage-1 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-6 + Conditional Operator + + Stage: Stage-7 + Map Reduce Local Work + Alias -> Map Local Tables: + b + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + b + TableScan + alias: b + GatherStats: false + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + GatherStats: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Local Work: + Map Reduce Local Work + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 1 + numRows 6000 + partition_columns ds + rawDataSize 63744 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 69744 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 1 + numRows 6000 + partition_columns ds + rawDataSize 63744 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 69744 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 
2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 228 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 228 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 228 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 228 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small + Truncated Path -> Alias: + /bucket_small/ds=2008-04-08 [a] + /bucket_small/ds=2008-04-09 [a] + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: -mr-10002 + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0 + columns.types bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Truncated Path -> Alias: +#### A masked pattern was here #### + + Stage: Stage-8 + Map Reduce Local Work + Alias -> Map Local Tables: + a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + a + TableScan + alias: a + GatherStats: false + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + + Stage: Stage-5 + Map Reduce + Alias -> Map Operator Tree: + b + TableScan + alias: b + GatherStats: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Local Work: + Map Reduce Local Work + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 1 + numRows 6000 + partition_columns ds + rawDataSize 63744 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 69744 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 1 + numRows 6000 + partition_columns ds + rawDataSize 63744 + serialization.ddl struct bucket_big { string key, string value} + 
serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 69744 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 228 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 228 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 228 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 228 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small + Truncated Path -> Alias: + /bucket_big/ds=2008-04-08 [b] + + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + b + TableScan + alias: b + GatherStats: false + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 
to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 1 + numRows 6000 + partition_columns ds + rawDataSize 63744 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 69744 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 1 + numRows 6000 + partition_columns ds + rawDataSize 63744 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 69744 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big + Truncated Path -> Alias: + /bucket_big/ds=2008-04-08 [b] + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@bucket_big@ds=2008-04-08 +PREHOOK: Input: default@bucket_small@ds=2008-04-08 +PREHOOK: Input: default@bucket_small@ds=2008-04-09 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@bucket_big@ds=2008-04-08 +POSTHOOK: Input: default@bucket_small@ds=2008-04-08 +POSTHOOK: Input: default@bucket_small@ds=2008-04-09 +#### A masked pattern was here #### +POSTHOOK: Lineage: bucket_big PARTITION(ds=2008-04-08).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), (src)src.FieldSchema(name:key, type:string, comment:default), (src)src.FieldSchema(name:key, type:string, comment:default), (src)src.FieldSchema(name:key, type:string, comment:default), (src)src.FieldSchema(name:key, type:string, comment:default), (src)src.FieldSchema(name:key, type:string, comment:default), (src)src.FieldSchema(name:key, type:string, comment:default), (src)src.FieldSchema(name:key, type:string, comment:default), (src)src.FieldSchema(name:key, type:string, comment:default), 
(src)src.FieldSchema(name:key, type:string, comment:default), (src)src.FieldSchema(name:key, type:string, comment:default), (src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: bucket_big PARTITION(ds=2008-04-08).value EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), (src)src.FieldSchema(name:value, type:string, comment:default), (src)src.FieldSchema(name:value, type:string, comment:default), (src)src.FieldSchema(name:value, type:string, comment:default), (src)src.FieldSchema(name:value, type:string, comment:default), (src)src.FieldSchema(name:value, type:string, comment:default), (src)src.FieldSchema(name:value, type:string, comment:default), (src)src.FieldSchema(name:value, type:string, comment:default), (src)src.FieldSchema(name:value, type:string, comment:default), (src)src.FieldSchema(name:value, type:string, comment:default), (src)src.FieldSchema(name:value, type:string, comment:default), (src)src.FieldSchema(name:value, type:string, comment:default), ] +456 Index: ql/src/test/results/clientpositive/auto_sortmerge_join_4.q.out =================================================================== --- ql/src/test/results/clientpositive/auto_sortmerge_join_4.q.out (revision 0) +++ ql/src/test/results/clientpositive/auto_sortmerge_join_4.q.out (working copy) @@ -0,0 +1,698 @@ +PREHOOK: query: -- small 2 part, 4 bucket & big 1 part, 2 bucket +CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- small 2 part, 4 bucket & big 1 part, 2 bucket +CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@bucket_small +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small +POSTHOOK: Output: default@bucket_small@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +POSTHOOK: type: 
LOAD +POSTHOOK: Output: default@bucket_small@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small +POSTHOOK: Output: default@bucket_small@ds=2008-04-09 +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small@ds=2008-04-09 +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small@ds=2008-04-09 +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small@ds=2008-04-09 +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small@ds=2008-04-09 +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small@ds=2008-04-09 +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small@ds=2008-04-09 +PREHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@bucket_big +PREHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big +POSTHOOK: Output: default@bucket_big@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big@ds=2008-04-08 +PREHOOK: query: -- Since size is being used to find the big table, the order of the tables in the join does not matter +explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: -- Since size is being used to find the big table, the order of the tables in the join does not matter +explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY 
(TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME bucket_small) a) (TOK_TABREF (TOK_TABNAME bucket_big) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count))))) + +STAGE DEPENDENCIES: + Stage-6 is a root stage , consists of Stage-7, Stage-8, Stage-1 + Stage-7 has a backup stage: Stage-1 + Stage-4 depends on stages: Stage-7 + Stage-2 depends on stages: Stage-1, Stage-4, Stage-5 + Stage-8 has a backup stage: Stage-1 + Stage-5 depends on stages: Stage-8 + Stage-1 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-6 + Conditional Operator + + Stage: Stage-7 + Map Reduce Local Work + Alias -> Map Local Tables: + b + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + b + TableScan + alias: b + GatherStats: false + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + GatherStats: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Local Work: + Map Reduce Local Work + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 1 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 1 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + 
partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 452 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 452 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 452 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 452 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small + Truncated Path -> Alias: + /bucket_small/ds=2008-04-08 [a] + /bucket_small/ds=2008-04-09 [a] + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: -mr-10002 + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0 + columns.types bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Truncated Path -> Alias: +#### A masked pattern was here #### + + Stage: Stage-8 + Map Reduce Local Work + Alias -> Map Local Tables: + a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + a + TableScan + alias: a + GatherStats: false + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + + Stage: Stage-5 + Map Reduce + Alias -> Map Operator Tree: + b + TableScan + alias: b + GatherStats: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Local Work: + Map Reduce Local Work + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 1 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 1 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 452 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 452 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 452 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 452 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small + Truncated Path -> Alias: + /bucket_big/ds=2008-04-08 [b] + + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + b + TableScan + alias: b + GatherStats: false + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition 
expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 1 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 1 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big + Truncated Path -> Alias: + /bucket_big/ds=2008-04-08 [b] + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@bucket_big@ds=2008-04-08 +PREHOOK: Input: default@bucket_small@ds=2008-04-08 +PREHOOK: Input: default@bucket_small@ds=2008-04-09 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@bucket_big@ds=2008-04-08 +POSTHOOK: Input: default@bucket_small@ds=2008-04-08 +POSTHOOK: Input: default@bucket_small@ds=2008-04-09 +#### A masked pattern was here #### +38 Index: ql/src/test/results/clientpositive/auto_sortmerge_join_8.q.out =================================================================== --- ql/src/test/results/clientpositive/auto_sortmerge_join_8.q.out (revision 0) +++ ql/src/test/results/clientpositive/auto_sortmerge_join_8.q.out (working copy) @@ -0,0 +1,865 @@ +PREHOOK: query: -- small 2 part, 2 bucket & big 2 part, 4 bucket +CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- small 2 part, 2 bucket & big 2 part, 4 bucket +CREATE TABLE bucket_small (key string, value 
string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@bucket_small +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small +POSTHOOK: Output: default@bucket_small@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small +POSTHOOK: Output: default@bucket_small@ds=2008-04-09 +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small@ds=2008-04-09 +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small@ds=2008-04-09 +PREHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@bucket_big +PREHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big +POSTHOOK: Output: default@bucket_big@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: 
Output: default@bucket_big@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big +POSTHOOK: Output: default@bucket_big@ds=2008-04-09 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big@ds=2008-04-09 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big@ds=2008-04-09 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big@ds=2008-04-09 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big@ds=2008-04-09 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big@ds=2008-04-09 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big@ds=2008-04-09 +PREHOOK: query: -- Since size is being used to find the big table, the order of the tables in the join does not matter +explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: -- Since size is being used to find the big table, the order of the tables in the join does not matter +explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME bucket_small) a) (TOK_TABREF (TOK_TABNAME bucket_big) b) (= (. (TOK_TABLE_OR_COL a) key) (. 
(TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count))))) + +STAGE DEPENDENCIES: + Stage-6 is a root stage , consists of Stage-7, Stage-8, Stage-1 + Stage-7 has a backup stage: Stage-1 + Stage-4 depends on stages: Stage-7 + Stage-2 depends on stages: Stage-1, Stage-4, Stage-5 + Stage-8 has a backup stage: Stage-1 + Stage-5 depends on stages: Stage-8 + Stage-1 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-6 + Conditional Operator + + Stage: Stage-7 + Map Reduce Local Work + Alias -> Map Local Tables: + b + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + b + TableScan + alias: b + GatherStats: false + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + GatherStats: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Local Work: + Map Reduce Local Work + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 11624 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-09 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-09 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + 
bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 11624 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 228 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 228 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 228 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: 
org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 228 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small + Truncated Path -> Alias: + /bucket_small/ds=2008-04-08 [a] + /bucket_small/ds=2008-04-09 [a] + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: -mr-10002 + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0 + columns.types bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Truncated Path -> Alias: +#### A masked pattern was here #### + + Stage: Stage-8 + Map Reduce Local Work + Alias -> Map Local Tables: + a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + a + TableScan + alias: a + GatherStats: false + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + + Stage: Stage-5 + Map Reduce + Alias -> Map Operator Tree: + b + TableScan + alias: b + GatherStats: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Local Work: + Map Reduce 
Local Work + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 11624 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-09 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-09 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 11624 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types 
string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 228 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 228 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 228 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 228 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small + Truncated Path -> Alias: + /bucket_big/ds=2008-04-08 [b] + /bucket_big/ds=2008-04-09 [b] + + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + b + TableScan + alias: b + GatherStats: false + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: 
+#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 11624 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-09 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-09 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 11624 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big + Truncated Path -> Alias: + /bucket_big/ds=2008-04-08 [b] + /bucket_big/ds=2008-04-09 [b] + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@bucket_big@ds=2008-04-08 +PREHOOK: Input: default@bucket_big@ds=2008-04-09 +PREHOOK: Input: 
default@bucket_small@ds=2008-04-08 +PREHOOK: Input: default@bucket_small@ds=2008-04-09 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@bucket_big@ds=2008-04-08 +POSTHOOK: Input: default@bucket_big@ds=2008-04-09 +POSTHOOK: Input: default@bucket_small@ds=2008-04-08 +POSTHOOK: Input: default@bucket_small@ds=2008-04-09 +#### A masked pattern was here #### +76 Index: ql/src/test/results/clientpositive/auto_sortmerge_join_3.q.out =================================================================== --- ql/src/test/results/clientpositive/auto_sortmerge_join_3.q.out (revision 0) +++ ql/src/test/results/clientpositive/auto_sortmerge_join_3.q.out (working copy) @@ -0,0 +1,686 @@ +PREHOOK: query: -- small 2 part, 2 bucket & big 1 part, 4 bucket +CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- small 2 part, 2 bucket & big 1 part, 4 bucket +CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@bucket_small +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small +POSTHOOK: Output: default@bucket_small@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small +POSTHOOK: Output: default@bucket_small@ds=2008-04-09 +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small@ds=2008-04-09 +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small@ds=2008-04-09 +PREHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@bucket_big +PREHOOK: query: load data local inpath 
'../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big +POSTHOOK: Output: default@bucket_big@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big@ds=2008-04-08 +PREHOOK: query: -- Since size is being used to find the big table, the order of the tables in the join does not matter +explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: -- Since size is being used to find the big table, the order of the tables in the join does not matter +explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME bucket_small) a) (TOK_TABREF (TOK_TABNAME bucket_big) b) (= (. (TOK_TABLE_OR_COL a) key) (. 
(TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count))))) + +STAGE DEPENDENCIES: + Stage-6 is a root stage , consists of Stage-7, Stage-8, Stage-1 + Stage-7 has a backup stage: Stage-1 + Stage-4 depends on stages: Stage-7 + Stage-2 depends on stages: Stage-1, Stage-4, Stage-5 + Stage-8 has a backup stage: Stage-1 + Stage-5 depends on stages: Stage-8 + Stage-1 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-6 + Conditional Operator + + Stage: Stage-7 + Map Reduce Local Work + Alias -> Map Local Tables: + b + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + b + TableScan + alias: b + GatherStats: false + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + GatherStats: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Local Work: + Map Reduce Local Work + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 1 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 1 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value 
+ columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 228 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 228 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 228 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 228 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small + Truncated Path -> Alias: + /bucket_small/ds=2008-04-08 [a] + /bucket_small/ds=2008-04-09 [a] + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: -mr-10002 + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + + input format: 
org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0 + columns.types bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Truncated Path -> Alias: +#### A masked pattern was here #### + + Stage: Stage-8 + Map Reduce Local Work + Alias -> Map Local Tables: + a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + a + TableScan + alias: a + GatherStats: false + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + + Stage: Stage-5 + Map Reduce + Alias -> Map Operator Tree: + b + TableScan + alias: b + GatherStats: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Local Work: + Map Reduce Local Work + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 1 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 1 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + 
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 228 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 228 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 228 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 228 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small + Truncated Path -> Alias: + /bucket_big/ds=2008-04-08 [b] + + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + b + TableScan + alias: b + GatherStats: false + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + File Output 
Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 1 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 1 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big + Truncated Path -> Alias: + /bucket_big/ds=2008-04-08 [b] + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@bucket_big@ds=2008-04-08 +PREHOOK: Input: default@bucket_small@ds=2008-04-08 +PREHOOK: Input: default@bucket_small@ds=2008-04-09 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@bucket_big@ds=2008-04-08 +POSTHOOK: Input: default@bucket_small@ds=2008-04-08 +POSTHOOK: Input: default@bucket_small@ds=2008-04-09 +#### A masked pattern was here #### +38 Index: ql/src/test/results/clientpositive/auto_sortmerge_join_7.q.out =================================================================== --- ql/src/test/results/clientpositive/auto_sortmerge_join_7.q.out (revision 0) +++ ql/src/test/results/clientpositive/auto_sortmerge_join_7.q.out (working copy) @@ -0,0 +1,865 @@ +PREHOOK: query: -- small 2 part, 4 bucket & big 2 part, 2 bucket +CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- small 2 part, 4 bucket & big 2 part, 2 bucket +CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE 
+POSTHOOK: Output: default@bucket_small +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small +POSTHOOK: Output: default@bucket_small@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small +POSTHOOK: Output: default@bucket_small@ds=2008-04-09 +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small@ds=2008-04-09 +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small@ds=2008-04-09 +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small@ds=2008-04-09 +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small@ds=2008-04-09 +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small@ds=2008-04-09 +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small@ds=2008-04-09 +PREHOOK: query: CREATE TABLE bucket_big (key string, value 
string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@bucket_big +PREHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big +POSTHOOK: Output: default@bucket_big@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big +POSTHOOK: Output: default@bucket_big@ds=2008-04-09 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big@ds=2008-04-09 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big@ds=2008-04-09 +PREHOOK: query: -- Since size is being used to find the big table, the order of the tables in the join does not matter +explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: -- Since size is being used to find the big table, the order of the tables in the join does not matter +explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME bucket_small) a) (TOK_TABREF (TOK_TABNAME bucket_big) b) (= (. (TOK_TABLE_OR_COL a) key) (. 
(TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count))))) + +STAGE DEPENDENCIES: + Stage-6 is a root stage , consists of Stage-7, Stage-8, Stage-1 + Stage-7 has a backup stage: Stage-1 + Stage-4 depends on stages: Stage-7 + Stage-2 depends on stages: Stage-1, Stage-4, Stage-5 + Stage-8 has a backup stage: Stage-1 + Stage-5 depends on stages: Stage-8 + Stage-1 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-6 + Conditional Operator + + Stage: Stage-7 + Map Reduce Local Work + Alias -> Map Local Tables: + b + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + b + TableScan + alias: b + GatherStats: false + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + GatherStats: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Local Work: + Map Reduce Local Work + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5500 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-09 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-09 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + 
bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5500 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 452 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 452 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 452 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: 
org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 452 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small + Truncated Path -> Alias: + /bucket_small/ds=2008-04-08 [a] + /bucket_small/ds=2008-04-09 [a] + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: -mr-10002 + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0 + columns.types bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Truncated Path -> Alias: +#### A masked pattern was here #### + + Stage: Stage-8 + Map Reduce Local Work + Alias -> Map Local Tables: + a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + a + TableScan + alias: a + GatherStats: false + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + + Stage: Stage-5 + Map Reduce + Alias -> Map Operator Tree: + b + TableScan + alias: b + GatherStats: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Local Work: + Map Reduce 
Local Work + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5500 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-09 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-09 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5500 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string 
+#### A masked pattern was here #### + name default.bucket_small + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 452 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 452 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 452 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 8 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 452 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small + Truncated Path -> Alias: + /bucket_big/ds=2008-04-08 [b] + /bucket_big/ds=2008-04-09 [b] + + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + b + TableScan + alias: b + GatherStats: false + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked 
pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5500 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-09 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-09 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5500 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big + Truncated Path -> Alias: + /bucket_big/ds=2008-04-08 [b] + /bucket_big/ds=2008-04-09 [b] + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@bucket_big@ds=2008-04-08 +PREHOOK: Input: default@bucket_big@ds=2008-04-09 +PREHOOK: Input: default@bucket_small@ds=2008-04-08 +PREHOOK: 
Input: default@bucket_small@ds=2008-04-09 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@bucket_big@ds=2008-04-08 +POSTHOOK: Input: default@bucket_big@ds=2008-04-09 +POSTHOOK: Input: default@bucket_small@ds=2008-04-08 +POSTHOOK: Input: default@bucket_small@ds=2008-04-09 +#### A masked pattern was here #### +76 Index: ql/src/test/results/clientpositive/auto_sortmerge_join_2.q.out =================================================================== --- ql/src/test/results/clientpositive/auto_sortmerge_join_2.q.out (revision 0) +++ ql/src/test/results/clientpositive/auto_sortmerge_join_2.q.out (working copy) @@ -0,0 +1,1415 @@ +PREHOOK: query: -- small 1 part, 4 bucket & big 2 part, 2 bucket +CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- small 1 part, 4 bucket & big 2 part, 2 bucket +CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@bucket_small +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small +POSTHOOK: Output: default@bucket_small@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small@ds=2008-04-08 +PREHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@bucket_big +PREHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big 
partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big +POSTHOOK: Output: default@bucket_big@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big +POSTHOOK: Output: default@bucket_big@ds=2008-04-09 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big@ds=2008-04-09 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big@ds=2008-04-09 +PREHOOK: query: -- Since the leftmost table is assumed as the big table, arrange the tables in the join accordingly +explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: -- Since the leftmost table is assumed as the big table, arrange the tables in the join accordingly +explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME bucket_big) a) (TOK_TABREF (TOK_TABNAME bucket_small) b) (= (. (TOK_TABLE_OR_COL a) key) (. 
(TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count))))) + +STAGE DEPENDENCIES: + Stage-6 is a root stage , consists of Stage-7, Stage-8, Stage-1 + Stage-7 has a backup stage: Stage-1 + Stage-4 depends on stages: Stage-7 + Stage-2 depends on stages: Stage-1, Stage-4, Stage-5 + Stage-8 has a backup stage: Stage-1 + Stage-5 depends on stages: Stage-8 + Stage-1 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-6 + Conditional Operator + + Stage: Stage-7 + Map Reduce Local Work + Alias -> Map Local Tables: + b + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + b + TableScan + alias: b + GatherStats: false + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + GatherStats: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Local Work: + Map Reduce Local Work + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5500 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-09 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-09 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + 
bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5500 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 1 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 226 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 1 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 226 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small + Truncated Path -> Alias: + /bucket_big/ds=2008-04-08 [a] + /bucket_big/ds=2008-04-09 [a] + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: -mr-10002 + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + 
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0 + columns.types bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Truncated Path -> Alias: +#### A masked pattern was here #### + + Stage: Stage-8 + Map Reduce Local Work + Alias -> Map Local Tables: + a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + a + TableScan + alias: a + GatherStats: false + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + + Stage: Stage-5 + Map Reduce + Alias -> Map Operator Tree: + b + TableScan + alias: b + GatherStats: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Local Work: + Map Reduce Local Work + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5500 +#### A masked pattern was 
here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-09 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-09 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5500 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 1 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 226 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 1 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 226 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small + Truncated Path -> Alias: + /bucket_small/ds=2008-04-08 [b] + + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + GatherStats: false + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + 
Position of Big Table: 0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5500 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-09 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-09 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5500 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe 
+ name: default.bucket_big + name: default.bucket_big + Truncated Path -> Alias: + /bucket_big/ds=2008-04-08 [a] + /bucket_big/ds=2008-04-09 [a] + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@bucket_big@ds=2008-04-08 +PREHOOK: Input: default@bucket_big@ds=2008-04-09 +PREHOOK: Input: default@bucket_small@ds=2008-04-08 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@bucket_big@ds=2008-04-08 +POSTHOOK: Input: default@bucket_big@ds=2008-04-09 +POSTHOOK: Input: default@bucket_small@ds=2008-04-08 +#### A masked pattern was here #### +38 +PREHOOK: query: -- Since size is being used to find the big table, the order of the tables in the join does not matter +explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: -- Since size is being used to find the big table, the order of the tables in the join does not matter +explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME bucket_small) a) (TOK_TABREF (TOK_TABNAME bucket_big) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONSTAR count))))) + +STAGE DEPENDENCIES: + Stage-6 is a root stage , consists of Stage-7, Stage-8, Stage-1 + Stage-7 has a backup stage: Stage-1 + Stage-4 depends on stages: Stage-7 + Stage-2 depends on stages: Stage-1, Stage-4, Stage-5 + Stage-8 has a backup stage: Stage-1 + Stage-5 depends on stages: Stage-8 + Stage-1 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-6 + Conditional Operator + + Stage: Stage-7 + Map Reduce Local Work + Alias -> Map Local Tables: + b + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + b + TableScan + alias: b + GatherStats: false + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + GatherStats: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Local Work: + Map Reduce Local Work + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name 
default.bucket_big + numFiles 2 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5500 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-09 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-09 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5500 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 1 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 226 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 1 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 226 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small + Truncated Path -> Alias: + /bucket_small/ds=2008-04-08 [a] + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: -mr-10002 + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0 + columns.types bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Truncated Path -> Alias: +#### A masked pattern was here #### + + Stage: Stage-8 + Map Reduce Local Work + Alias -> Map Local Tables: + a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + a + TableScan + alias: a + GatherStats: false + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + + Stage: Stage-5 + Map Reduce + Alias -> Map Operator Tree: + b + TableScan + alias: b + GatherStats: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Local Work: + Map Reduce Local Work + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path 
-> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5500 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-09 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-09 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5500 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 1 
+ numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 226 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_small + numFiles 4 + numPartitions 1 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_small { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 226 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_small + name: default.bucket_small + Truncated Path -> Alias: + /bucket_big/ds=2008-04-08 [b] + /bucket_big/ds=2008-04-09 [b] + + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + b + TableScan + alias: b + GatherStats: false + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5500 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here 
#### + Partition + base file name: ds=2008-04-09 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-09 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 2 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + numFiles 4 + numPartitions 2 + numRows 0 + partition_columns ds + rawDataSize 0 + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5500 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big + Truncated Path -> Alias: + /bucket_big/ds=2008-04-08 [b] + /bucket_big/ds=2008-04-09 [b] + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@bucket_big@ds=2008-04-08 +PREHOOK: Input: default@bucket_big@ds=2008-04-09 +PREHOOK: Input: default@bucket_small@ds=2008-04-08 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@bucket_big@ds=2008-04-08 +POSTHOOK: Input: default@bucket_big@ds=2008-04-09 +POSTHOOK: Input: default@bucket_small@ds=2008-04-08 +#### A masked pattern was here #### +38 Index: ql/src/test/queries/clientpositive/auto_sortmerge_join_3.q =================================================================== --- ql/src/test/queries/clientpositive/auto_sortmerge_join_3.q (revision 0) +++ ql/src/test/queries/clientpositive/auto_sortmerge_join_3.q (working copy) @@ -0,0 +1,23 @@ +-- small 2 part, 2 bucket & big 1 part, 4 bucket +CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE; +load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08'); +load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08'); + +load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09'); +load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09'); + +CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE; +load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08'); +load data local inpath 
'../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08'); +load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08'); +load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08'); + +set hive.auto.sort.merge.join=true; +set hive.optimize.bucketmapjoin = true; +set hive.optimize.bucketmapjoin.sortedmerge = true; + +set hive.auto.sort.merge.join.bigtbl.matcher = org.apache.hadoop.hive.ql.optimizer.SizeSortMergeJoinBigTableMatcher; + +-- Since size is being used to find the big table, the order of the tables in the join does not matter +explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key; +select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key; Index: ql/src/test/queries/clientpositive/auto_sortmerge_join_7.q =================================================================== --- ql/src/test/queries/clientpositive/auto_sortmerge_join_7.q (revision 0) +++ ql/src/test/queries/clientpositive/auto_sortmerge_join_7.q (working copy) @@ -0,0 +1,28 @@ +-- small 2 part, 4 bucket & big 2 part, 2 bucket +CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE; +load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08'); +load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08'); +load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08'); +load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08'); + +load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09'); +load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09'); +load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09'); +load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09'); + +CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE; +load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08'); +load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08'); + +load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09'); +load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09'); + +set hive.auto.sort.merge.join=true; +set hive.optimize.bucketmapjoin = true; +set hive.optimize.bucketmapjoin.sortedmerge = true; + +set hive.auto.sort.merge.join.bigtbl.matcher = org.apache.hadoop.hive.ql.optimizer.SizeSortMergeJoinBigTableMatcher; + +-- Since size is being used to find the big table, the order of the tables in the join does not matter +explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key; +select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key; Index: ql/src/test/queries/clientpositive/auto_sortmerge_join_4.q 
=================================================================== --- ql/src/test/queries/clientpositive/auto_sortmerge_join_4.q (revision 0) +++ ql/src/test/queries/clientpositive/auto_sortmerge_join_4.q (working copy) @@ -0,0 +1,25 @@ +-- small 2 part, 4 bucket & big 1 part, 2 bucket +CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE; +load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08'); +load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08'); +load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08'); +load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08'); + +load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09'); +load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09'); +load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09'); +load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09'); + +CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE; +load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08'); +load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08'); + +set hive.auto.sort.merge.join=true; +set hive.optimize.bucketmapjoin = true; +set hive.optimize.bucketmapjoin.sortedmerge = true; + +set hive.auto.sort.merge.join.bigtbl.matcher = org.apache.hadoop.hive.ql.optimizer.SizeSortMergeJoinBigTableMatcher; + +-- Since size is being used to find the big table, the order of the tables in the join does not matter +explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key; +select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key; Index: ql/src/test/queries/clientpositive/auto_sortmerge_join_8.q =================================================================== --- ql/src/test/queries/clientpositive/auto_sortmerge_join_8.q (revision 0) +++ ql/src/test/queries/clientpositive/auto_sortmerge_join_8.q (working copy) @@ -0,0 +1,28 @@ +-- small 2 part, 2 bucket & big 2 part, 4 bucket +CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE; +load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08'); +load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08'); + +load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09'); +load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09'); + +CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE; +load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big 
partition(ds='2008-04-08'); +load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08'); +load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08'); +load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08'); + +load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09'); +load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09'); +load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09'); +load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09'); + +set hive.auto.sort.merge.join=true; +set hive.optimize.bucketmapjoin = true; +set hive.optimize.bucketmapjoin.sortedmerge = true; + +set hive.auto.sort.merge.join.bigtbl.matcher = org.apache.hadoop.hive.ql.optimizer.SizeSortMergeJoinBigTableMatcher; + +-- Since size is being used to find the big table, the order of the tables in the join does not matter +explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key; +select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key; Index: ql/src/test/queries/clientpositive/auto_sortmerge_join_1.q =================================================================== --- ql/src/test/queries/clientpositive/auto_sortmerge_join_1.q (revision 0) +++ ql/src/test/queries/clientpositive/auto_sortmerge_join_1.q (working copy) @@ -0,0 +1,26 @@ +-- small 1 part, 2 bucket & big 2 part, 4 bucket + +CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE; +load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08'); +load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08'); + +CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE; +load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08'); +load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08'); +load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08'); +load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08'); + +load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09'); +load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09'); +load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09'); +load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09'); + +set hive.auto.sort.merge.join=true; +set hive.optimize.bucketmapjoin = true; +set hive.optimize.bucketmapjoin.sortedmerge = true; + +set hive.auto.sort.merge.join.bigtbl.matcher = org.apache.hadoop.hive.ql.optimizer.SizeSortMergeJoinBigTableMatcher; + +-- Since size is being used to find the big table, the order of the tables in the join does not matter +explain 
extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key; +select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key; Index: ql/src/test/queries/clientpositive/auto_sortmerge_join_5.q =================================================================== --- ql/src/test/queries/clientpositive/auto_sortmerge_join_5.q (revision 0) +++ ql/src/test/queries/clientpositive/auto_sortmerge_join_5.q (working copy) @@ -0,0 +1,21 @@ +-- small no part, 4 bucket & big no part, 2 bucket +CREATE TABLE bucket_small (key string, value string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE; +load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small; +load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small; +load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small; +load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small; + +CREATE TABLE bucket_big (key string, value string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE; +load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big; +load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big; + +set hive.auto.sort.merge.join=true; +set hive.optimize.bucketmapjoin = true; +set hive.optimize.bucketmapjoin.sortedmerge = true; + +set hive.auto.sort.merge.join.bigtbl.matcher = org.apache.hadoop.hive.ql.optimizer.SizeSortMergeJoinBigTableMatcher; + +-- Since size is being used to find the big table, the order of the tables in the join does not matter +explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key; +select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key; + Index: ql/src/test/queries/clientpositive/auto_sortmerge_join_9.q =================================================================== --- ql/src/test/queries/clientpositive/auto_sortmerge_join_9.q (revision 0) +++ ql/src/test/queries/clientpositive/auto_sortmerge_join_9.q (working copy) @@ -0,0 +1,54 @@ +-- small 2 part, 2 bucket & big 2 part, 4 bucket +CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE; +load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08'); +load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08'); + +load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09'); +load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09'); + +CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE; + +set hive.enforce.bucketing=true; +set hive.enforce.sorting=true; +set hive.exec.reducers.max = 1; +set hive.merge.mapfiles=false; +set hive.merge.mapredfiles=false; + +INSERT OVERWRITE TABLE bucket_big PARTITION (ds = '2008-04-08') +SELECT * FROM +( + select * from src + union all + select * from src + union all + select * from src + union all + select * from src + union all + select * from src + union all + select * from src + union all + select * from src + union all + select * from src + union all + select * from src + union all + select * from src + union 
all + select * from src + union all + select * from src +) subq; + + +set hive.auto.sort.merge.join=true; +set hive.optimize.bucketmapjoin = true; +set hive.optimize.bucketmapjoin.sortedmerge = true; +set hive.mapjoin.smalltable.filesize=50; +set hive.auto.sort.merge.join.bigtbl.matcher = org.apache.hadoop.hive.ql.optimizer.SizeSortMergeJoinBigTableMatcher; + +-- Since size is being used to find the big table, the order of the tables in the join does not matter +explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key; +select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key; Index: ql/src/test/queries/clientpositive/auto_sortmerge_join_2.q =================================================================== --- ql/src/test/queries/clientpositive/auto_sortmerge_join_2.q (revision 0) +++ ql/src/test/queries/clientpositive/auto_sortmerge_join_2.q (working copy) @@ -0,0 +1,29 @@ +-- small 1 part, 4 bucket & big 2 part, 2 bucket +CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE; +load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08'); +load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08'); +load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08'); +load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08'); + +CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE; +load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08'); +load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08'); + +load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09'); +load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09'); + +set hive.auto.sort.merge.join=true; +set hive.optimize.bucketmapjoin = true; +set hive.optimize.bucketmapjoin.sortedmerge = true; + +set hive.auto.sort.merge.join.bigtbl.matcher = org.apache.hadoop.hive.ql.optimizer.LeftSortMergeJoinBigTableMatcher; + +-- Since the leftmost table is assumed as the big table, arrange the tables in the join accordingly +explain extended select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key; +select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key; + +set hive.auto.sort.merge.join.bigtbl.matcher = org.apache.hadoop.hive.ql.optimizer.SizeSortMergeJoinBigTableMatcher; + +-- Since size is being used to find the big table, the order of the tables in the join does not matter +explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key; +select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key; Index: ql/src/test/queries/clientpositive/auto_sortmerge_join_6.q =================================================================== --- ql/src/test/queries/clientpositive/auto_sortmerge_join_6.q (revision 0) +++ ql/src/test/queries/clientpositive/auto_sortmerge_join_6.q (working copy) @@ -0,0 +1,23 @@ +-- small no part, 4 bucket & big 2 part, 2 bucket +CREATE TABLE bucket_small (key string, value string) 
CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE; +load data local inpath '../data/files/smallsrcsortbucket1outof4.txt' INTO TABLE bucket_small; +load data local inpath '../data/files/smallsrcsortbucket2outof4.txt' INTO TABLE bucket_small; +load data local inpath '../data/files/smallsrcsortbucket3outof4.txt' INTO TABLE bucket_small; +load data local inpath '../data/files/smallsrcsortbucket4outof4.txt' INTO TABLE bucket_small; + +CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE; +load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08'); +load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08'); + +load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09'); +load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09'); + +set hive.auto.sort.merge.join=true; +set hive.optimize.bucketmapjoin = true; +set hive.optimize.bucketmapjoin.sortedmerge = true; + +set hive.auto.sort.merge.join.bigtbl.matcher = org.apache.hadoop.hive.ql.optimizer.SizeSortMergeJoinBigTableMatcher; + +-- Since size is being used to find the big table, the order of the tables in the join does not matter +explain extended select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key; +select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key; Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/AvgPartitionSizeSortMergeJoinBigTableMatcher.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/AvgPartitionSizeSortMergeJoinBigTableMatcher.java (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/AvgPartitionSizeSortMergeJoinBigTableMatcher.java (working copy) @@ -0,0 +1,148 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.ql.optimizer; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.exec.JoinOperator; +import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.exec.TableScanOperator; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.metadata.Partition; +import org.apache.hadoop.hive.ql.metadata.Table; +import org.apache.hadoop.hive.ql.optimizer.ppr.PartitionPruner; +import org.apache.hadoop.hive.ql.parse.ParseContext; +import org.apache.hadoop.hive.ql.parse.PrunedPartitionList; +import org.apache.hadoop.hive.ql.parse.SemanticException; +import org.apache.hadoop.hive.ql.plan.OperatorDesc; + +/* + * This is a pluggable policy to choose the candidate map-join table for converting a join to a + * sort-merge join. The table with the largest average partition size is chosen as the big table. + */ +public class AvgPartitionSizeSortMergeJoinBigTableMatcher + implements SortMergeJoinBigTableMatcher { + private void getListTopOps( + Operator<? extends OperatorDesc> op, List<TableScanOperator> topOps) { + if ((op.getParentOperators() == null) || + (op.getParentOperators().isEmpty())) { + return; + } + + for (Operator<? extends OperatorDesc> parentOp : op.getParentOperators()) { + if (parentOp instanceof TableScanOperator) { + topOps.add((TableScanOperator)parentOp); + } + else { + getListTopOps(parentOp, topOps); + } + } + } + + private long getSize(HiveConf conf, String size, Path path) { + // If the size is present in the metastore, use it + if (size != null) { + try { + return Long.valueOf(size); + } catch (NumberFormatException e) { + return 0; + } + } + + try { + FileSystem fs = path.getFileSystem(conf); + return fs.getContentSummary(path).getLength(); + } catch (Exception e) { + return 0; + } + } + + private long getSize(HiveConf conf, Table table) { + Path path = table.getPath(); + String size = table.getProperty("totalSize"); + return getSize(conf, size, path); + } + + private long getSize(HiveConf conf, Partition partition) { + Path path = partition.getPartitionPath(); + String size = partition.getParameters().get("totalSize"); + + return getSize(conf, size, path); + } + + public int getBigTablePosition(ParseContext parseCtx, JoinOperator joinOp) + throws SemanticException { + int bigTablePos = 0; + long maxSize = 0; + int numPartitionsCurrentBigTable = 0; // number of partitions for the chosen big table + HiveConf conf = parseCtx.getConf(); + + try { + List<TableScanOperator> topOps = new ArrayList<TableScanOperator>(); + getListTopOps(joinOp, topOps); + int currentPos = 0; + for (TableScanOperator topOp : topOps) { + int numPartitions = 1; // in case the sizes match, preference is + // given to the table with fewer partitions + Table table = parseCtx.getTopToTable().get(topOp); + long averageSize = 0; + + if (!table.isPartitioned()) { + averageSize = getSize(conf, table); + } + else { + // For partitioned tables, get the size of all the partitions + PrunedPartitionList partsList = + PartitionPruner.prune(parseCtx.getTopToTable().get(topOp), + parseCtx.getOpToPartPruner().get(topOp), parseCtx.getConf(), + null, parseCtx.getPrunedPartitions()); + numPartitions = partsList.getNotDeniedPartns().size(); + long totalSize = 0; + for (Partition part : partsList.getNotDeniedPartns()) { + totalSize += getSize(conf, part); + } + averageSize = totalSize/numPartitions; + } + + if (averageSize > maxSize) { + maxSize = averageSize; + bigTablePos = currentPos; +
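// remember how many partitions the new big-table candidate has, so a later tie on average size can be broken in favor of the table with fewer partitions +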
numPartitionsCurrentBigTable = numPartitions; + } + // If the sizes match, prefer the table with fewer partitions + else if (averageSize == maxSize) { + if (numPartitions < numPartitionsCurrentBigTable) { + bigTablePos = currentPos; + numPartitionsCurrentBigTable = numPartitions; + } + } + + currentPos++; + } + } catch (HiveException e) { + throw new SemanticException(e.getMessage()); + } + + return bigTablePos; + } +} Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/AbstractSMBJoinProc.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/AbstractSMBJoinProc.java (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/AbstractSMBJoinProc.java (working copy) @@ -0,0 +1,433 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements.See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership.The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License.You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.optimizer; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Stack; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.metastore.api.Order; +import org.apache.hadoop.hive.ql.exec.FunctionRegistry; +import org.apache.hadoop.hive.ql.exec.JoinOperator; +import org.apache.hadoop.hive.ql.exec.MapJoinOperator; +import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator; +import org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator; +import org.apache.hadoop.hive.ql.exec.TableScanOperator; +import org.apache.hadoop.hive.ql.lib.Node; +import org.apache.hadoop.hive.ql.lib.NodeProcessor; +import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.metadata.Partition; +import org.apache.hadoop.hive.ql.metadata.Table; +import org.apache.hadoop.hive.ql.optimizer.ppr.PartitionPruner; +import org.apache.hadoop.hive.ql.parse.ParseContext; +import org.apache.hadoop.hive.ql.parse.PrunedPartitionList; +import org.apache.hadoop.hive.ql.parse.QBJoinTree; +import org.apache.hadoop.hive.ql.parse.SemanticException; +import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; +import org.apache.hadoop.hive.ql.plan.OperatorDesc; +import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc; +import org.apache.hadoop.hive.ql.plan.SMBJoinDesc; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.apache.hadoop.util.ReflectionUtils; + +//try to replace a bucket map join with a sorted merge map join +abstract public class AbstractSMBJoinProc extends 
AbstractBucketJoinProc implements NodeProcessor { + + private static final Log LOG = LogFactory + .getLog(SortedMergeBucketMapJoinOptimizer.class.getName()); + + public AbstractSMBJoinProc(ParseContext pctx) { + super(pctx); + } + + public AbstractSMBJoinProc() { + super(); + } + + @Override + abstract public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, + Object... nodeOutputs) throws SemanticException; + + // Return true or false based on whether the mapjoin was converted successfully to + // a sort-merge map join operator. + protected boolean canConvertBucketMapJoinToSMBJoin(MapJoinOperator mapJoinOp, + Stack stack, + SortBucketJoinOptProcCtx smbJoinContext, + Object... nodeOutputs) throws SemanticException { + + if (mapJoinOp.getConf().getAliasBucketFileNameMapping() == null + || mapJoinOp.getConf().getAliasBucketFileNameMapping().size() == 0) { + return false; + } + + boolean tableSorted = true; + QBJoinTree joinCxt = this.pGraphContext.getMapJoinContext() + .get(mapJoinOp); + if (joinCxt == null) { + return false; + } + String[] srcs = joinCxt.getBaseSrc(); + int pos = 0; + + // All the tables/partitions columns should be sorted in the same order + // For example, if tables A and B are being joined on columns c1, c2 and c3 + // which are the sorted and bucketed columns. The join would work, as long + // c1, c2 and c3 are sorted in the same order. + List sortColumnsFirstTable = new ArrayList(); + + for (String src : srcs) { + tableSorted = tableSorted + && isTableSorted(smbJoinContext, + pGraphContext, + mapJoinOp.getConf().getKeys().get((byte) pos), + joinCxt, + src, + pos, + sortColumnsFirstTable); + pos++; + } + if (!tableSorted) { + // this is a mapjoin but not suited for a sort merge bucket map join. check outer joins + MapJoinProcessor.checkMapJoin(mapJoinOp.getConf().getPosBigTable(), + mapJoinOp.getConf().getConds()); + return false; + } + + smbJoinContext.setSrcs(srcs); + return true; + } + + + // Convert the bucket map-join operator to a sort-merge map join operator + protected SMBMapJoinOperator convertBucketMapJoinToSMBJoin(MapJoinOperator mapJoinOp, + SortBucketJoinOptProcCtx smbJoinContext, + ParseContext parseContext) { + + String[] srcs = smbJoinContext.getSrcs(); + SMBMapJoinOperator smbJop = new SMBMapJoinOperator(mapJoinOp); + SMBJoinDesc smbJoinDesc = new SMBJoinDesc(mapJoinOp.getConf()); + smbJop.setConf(smbJoinDesc); + HashMap tagToAlias = new HashMap(); + for (int i = 0; i < srcs.length; i++) { + tagToAlias.put((byte) i, srcs[i]); + } + smbJoinDesc.setTagToAlias(tagToAlias); + + int indexInListMapJoinNoReducer = + this.pGraphContext.getListMapJoinOpsNoReducer().indexOf(mapJoinOp); + if (indexInListMapJoinNoReducer >= 0 ) { + this.pGraphContext.getListMapJoinOpsNoReducer().remove(indexInListMapJoinNoReducer); + this.pGraphContext.getListMapJoinOpsNoReducer().add(indexInListMapJoinNoReducer, smbJop); + } + + List> parentOperators = mapJoinOp.getParentOperators(); + for (int i = 0; i < parentOperators.size(); i++) { + Operator par = parentOperators.get(i); + int index = par.getChildOperators().indexOf(mapJoinOp); + par.getChildOperators().remove(index); + par.getChildOperators().add(index, smbJop); + } + List> childOps = mapJoinOp.getChildOperators(); + for (int i = 0; i < childOps.size(); i++) { + Operator child = childOps.get(i); + int index = child.getParentOperators().indexOf(mapJoinOp); + child.getParentOperators().remove(index); + child.getParentOperators().add(index, smbJop); + } + parseContext.getSmbMapJoinContext().put(smbJop, + 
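// re-key the original map join's query-block join tree under the new SMB join operator +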
parseContext.getMapJoinContext().get(mapJoinOp)); + parseContext.getMapJoinContext().remove(mapJoinOp); + parseContext.getOpParseCtx().put(smbJop, parseContext.getOpParseCtx().get(mapJoinOp)); + + return smbJop; + } + + /** + * Whether this table is eligible for a sort-merge join. + * + * @param pctx parse context + * @param op map join operator being considered + * @param joinTree join tree being considered + * @param alias table alias in the join tree being checked + * @param pos position of the table + * @param sortColumnsFirstTable The names and order of the sorted columns for the first table. + * It is not initialized when pos = 0. + * @return + * @throws SemanticException + */ + private boolean isTableSorted( + SortBucketJoinOptProcCtx smbJoinContext, + ParseContext pctx, + List keys, + QBJoinTree joinTree, + String alias, + int pos, + List sortColumnsFirstTable) throws SemanticException { + + HashMap> topOps = this.pGraphContext.getTopOps(); + Map topToTable = this.pGraphContext + .getTopToTable(); + TableScanOperator tso = (TableScanOperator) topOps.get(alias); + if (tso == null) { + return false; + } + + // get all join columns from join keys + List joinCols = new ArrayList(); + List joinKeys = new ArrayList(); + joinKeys.addAll(keys); + while (joinKeys.size() > 0) { + ExprNodeDesc node = joinKeys.remove(0); + if (node instanceof ExprNodeColumnDesc) { + joinCols.addAll(node.getCols()); + } else if (node instanceof ExprNodeGenericFuncDesc) { + ExprNodeGenericFuncDesc udfNode = ((ExprNodeGenericFuncDesc) node); + GenericUDF udf = udfNode.getGenericUDF(); + if (!FunctionRegistry.isDeterministic(udf)) { + return false; + } + joinKeys.addAll(0, udfNode.getChildExprs()); + } + } + + Table tbl = topToTable.get(tso); + if (tbl.isPartitioned()) { + PrunedPartitionList prunedParts = null; + try { + prunedParts = pGraphContext.getOpToPartList().get(tso); + if (prunedParts == null) { + prunedParts = PartitionPruner.prune(tbl, pGraphContext + .getOpToPartPruner().get(tso), pGraphContext.getConf(), alias, + pGraphContext.getPrunedPartitions()); + pGraphContext.getOpToPartList().put(tso, prunedParts); + } + } catch (HiveException e) { + LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e)); + throw new SemanticException(e.getMessage(), e); + } + List partitions = prunedParts.getNotDeniedPartns(); + // Populate the names and order of columns for the first partition of the + // first table + if ((pos == 0) && (partitions != null) && (!partitions.isEmpty())) { + Partition firstPartition = partitions.get(0); + sortColumnsFirstTable.addAll(firstPartition.getSortCols()); + } + + for (Partition partition : prunedParts.getNotDeniedPartns()) { + if (!checkSortColsAndJoinCols(partition.getSortCols(), + joinCols, + sortColumnsFirstTable)) { + return false; + } + } + return true; + } + + // Populate the names and order of columns for the first table + if (pos == 0) { + sortColumnsFirstTable.addAll(tbl.getSortCols()); + } + + return checkSortColsAndJoinCols(tbl.getSortCols(), + joinCols, + sortColumnsFirstTable); + } + + private boolean checkSortColsAndJoinCols(List sortCols, + List joinCols, + List sortColumnsFirstPartition) { + + if (sortCols == null || sortCols.size() != joinCols.size()) { + return false; + } + + List sortColNames = new ArrayList(); + + // The join columns should contain all the sort columns + // The sort columns of all the tables should be in the same order + // compare the column names and the order with the first table/partition. 
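+ // For example, if sortCols = (c1 ASC, c2 ASC), sortColumnsFirstPartition = (c1 ASC, c2 ASC) + // and joinCols = (c2, c1), the orders match and the join columns contain every sort column, + // so this check passes.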
+ for (int pos = 0; pos < sortCols.size(); pos++) { + Order o = sortCols.get(pos); + + if (o.getOrder() != sortColumnsFirstPartition.get(pos).getOrder()) { + return false; + } + sortColNames.add(o.getCol()); + } + + // The column names and order (ascending/descending) matched + // The join columns should contain sort columns + return sortColNames.containsAll(joinCols); + } + + // Can the join operator be converted to a sort-merge join operator ? + // It is already verified that the join can be converted to a bucket map join + protected boolean checkConvertJoinToSMBJoin( + JoinOperator joinOperator, + SortBucketJoinOptProcCtx smbJoinContext, + ParseContext pGraphContext) throws SemanticException { + + boolean tableSorted = true; + QBJoinTree joinCtx = pGraphContext.getJoinContext().get(joinOperator); + + if (joinCtx == null) { + return false; + } + String[] srcs = joinCtx.getBaseSrc(); + int pos = 0; + + // All the tables/partitions columns should be sorted in the same order + // For example, if tables A and B are being joined on columns c1, c2 and c3 + // which are the sorted and bucketed columns. The join would work, as long + // c1, c2 and c3 are sorted in the same order. + List sortColumnsFirstTable = new ArrayList(); + + for (String src : srcs) { + tableSorted = tableSorted && + isTableSorted(smbJoinContext, + pGraphContext, + smbJoinContext.getKeyExprMap().get((byte)pos), + joinCtx, + src, + pos, + sortColumnsFirstTable); + pos++; + } + + smbJoinContext.setSrcs(srcs); + return true; + } + + // Can the join operator be converted to a sort-merge join operator ? + protected boolean canConvertJoinToSMBJoin( + JoinOperator joinOperator, + SortBucketJoinOptProcCtx smbJoinContext, + ParseContext pGraphContext) throws SemanticException { + boolean canConvert = + canConvertJoinToBucketMapJoin( + joinOperator, + pGraphContext, + smbJoinContext + ); + + if (!canConvert) { + return false; + } + + return checkConvertJoinToSMBJoin(joinOperator, smbJoinContext, pGraphContext); + } + + // Can the join operator be converted to a bucket map-merge join operator ? + protected boolean canConvertJoinToBucketMapJoin( + JoinOperator joinOp, + ParseContext pGraphContext, + SortBucketJoinOptProcCtx context) throws SemanticException { + + // This has already been inspected and rejected + if (context.getListOfRejectedJoins().contains(joinOp)) { + return false; + } + + QBJoinTree joinCtx = pGraphContext.getJoinContext().get(joinOp); + if (joinCtx == null) { + return false; + } + + Class bigTableMatcherClass = null; + try { + bigTableMatcherClass = + (Class) + (Class.forName(HiveConf.getVar(pGraphContext.getConf(), + HiveConf.ConfVars.HIVE_AUTO_SORT_MERGE_JOIN_BIGTABLE_MATCHER))); + } catch (ClassNotFoundException e) { + throw new RuntimeException(e.getMessage()); + } + + SortMergeJoinBigTableMatcher bigTableMatcher = + (SortMergeJoinBigTableMatcher) ReflectionUtils.newInstance(bigTableMatcherClass, null); + int bigTablePosition = + bigTableMatcher.getBigTablePosition(pGraphContext, joinOp); + context.setBigTablePosition(bigTablePosition); + String joinAlias = + bigTablePosition == 0 ? 
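+ // position 0 denotes the left-most alias; any other position indexes into the right aliases, offset by one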
+ joinCtx.getLeftAlias() : joinCtx.getRightAliases()[bigTablePosition - 1]; + + Map> keyExprMap = new HashMap>(); + List> parentOps = joinOp.getParentOperators(); + // get the join keys from parent ReduceSink operators + for (Operator parentOp : parentOps) { + ReduceSinkDesc rsconf = ((ReduceSinkOperator)parentOp).getConf(); + Byte tag = (byte) rsconf.getTag(); + List keys = rsconf.getKeyCols(); + keyExprMap.put(tag, keys); + } + + context.setKeyExprMap(keyExprMap); + + // Given a candidate map-join, can this join be converted. + // The candidate map-join was derived from the pluggable sort merge join big + // table matcher. + return checkConvertBucketMapJoin( + pGraphContext, + context, + joinCtx, + keyExprMap, + joinAlias, + Arrays.asList(joinCtx.getBaseSrc())); + } + + // Convert the join operator to a bucket map-join join operator + protected MapJoinOperator convertJoinToBucketMapJoin( + JoinOperator joinOp, + SortBucketJoinOptProcCtx joinContext, + ParseContext parseContext) throws SemanticException { + MapJoinOperator mapJoinOp = MapJoinProcessor.convertMapJoin( + parseContext.getOpParseCtx(), + joinOp, + pGraphContext.getJoinContext().get(joinOp), + joinContext.getBigTablePosition(), + false); + // Remove the join operator from the query join context + parseContext.getMapJoinContext().put(mapJoinOp, parseContext.getJoinContext().get(joinOp)); + parseContext.getJoinContext().remove(joinOp); + convertMapJoinToBucketMapJoin(mapJoinOp, joinContext); + return mapJoinOp; + } + + // Convert the join operator to a sort-merge join operator + protected void convertJoinToSMBJoin( + JoinOperator joinOp, + SortBucketJoinOptProcCtx smbJoinContext, + ParseContext parseContext) throws SemanticException { + MapJoinOperator mapJoinOp = convertJoinToBucketMapJoin(joinOp, smbJoinContext, parseContext); + SMBMapJoinOperator smbMapJoinOp = + convertBucketMapJoinToSMBJoin(mapJoinOp, smbJoinContext, parseContext); + smbMapJoinOp.setConvertedJoin(true); + } +} Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortBucketJoinOptProcCtx.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortBucketJoinOptProcCtx.java (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortBucketJoinOptProcCtx.java (working copy) @@ -0,0 +1,59 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements.See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership.The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License.You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hive.ql.optimizer; + +import java.util.List; +import java.util.Map; + +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; + + +public class SortBucketJoinOptProcCtx extends BucketJoinOptProcCtx { + private String[] srcs; + private int bigTablePosition; + private Map> keyExprMap; + + public SortBucketJoinOptProcCtx(HiveConf conf) { + super(conf); + } + + public String[] getSrcs() { + return srcs; + } + + public void setSrcs(String[] srcs) { + this.srcs = srcs; + } + + public int getBigTablePosition() { + return bigTablePosition; + } + + public void setBigTablePosition(int bigTablePosition) { + this.bigTablePosition = bigTablePosition; + } + + public Map> getKeyExprMap() { + return keyExprMap; + } + + public void setKeyExprMap(Map> keyExprMap) { + this.keyExprMap = keyExprMap; + } +} Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java (revision 1406918) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java (working copy) @@ -43,6 +43,7 @@ import org.apache.hadoop.hive.ql.exec.OperatorFactory; import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator; import org.apache.hadoop.hive.ql.exec.RowSchema; +import org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator; import org.apache.hadoop.hive.ql.exec.ScriptOperator; import org.apache.hadoop.hive.ql.exec.SelectOperator; import org.apache.hadoop.hive.ql.exec.UnionOperator; @@ -73,6 +74,7 @@ import org.apache.hadoop.hive.ql.plan.PartitionDesc; import org.apache.hadoop.hive.ql.plan.PlanUtils; import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc; +import org.apache.hadoop.hive.ql.plan.SMBJoinDesc; import org.apache.hadoop.hive.ql.plan.SelectDesc; import org.apache.hadoop.hive.ql.plan.TableDesc; import org.apache.hadoop.hive.serde.Constants; @@ -119,7 +121,7 @@ ArrayList smallTableAliasList = new ArrayList(); String bigTableAlias = null; - // create a new MapredLocalWork + // create a new MapredLocalWork MapredLocalWork newLocalWork = new MapredLocalWork( new LinkedHashMap>(), new LinkedHashMap()); @@ -202,7 +204,7 @@ // set alias to fetch work newLocalWork.getAliasToFetchWork().put(alias, fetchWork); } - // remove small table ailias from aliasToWork;Avoid concurrent modification + // remove small table alias from aliasToWork;Avoid concurrent modification for (String alias : smallTableAliasList) { newWork.getAliasToWork().remove(alias); } @@ -218,31 +220,25 @@ return bigTableAlias; } - public static String genMapJoinOpAndLocalWork(MapredWork newWork, JoinOperator op, int mapJoinPos) - throws SemanticException { - try { - LinkedHashMap, OpParseContext> opParseCtxMap = - newWork.getOpParseCtxMap(); - QBJoinTree newJoinTree = newWork.getJoinTree(); - // generate the map join operator; already checked the map join - MapJoinOperator newMapJoinOp = MapJoinProcessor.convertMapJoin(opParseCtxMap, op, - newJoinTree, mapJoinPos, true); - // generate the local work and return the big table alias - String bigTableAlias = MapJoinProcessor - .genMapJoinLocalWork(newWork, newMapJoinOp, mapJoinPos); - // clean up the mapred work - newWork.setOpParseCtxMap(null); - newWork.setJoinTree(null); + public static String genMapJoinOpAndLocalWork(MapredWork newWork, + MapJoinOperator newMapJoinOp, int mapJoinPos) + throws SemanticException { + try { + // generate the local work and return the big table alias + String 
bigTableAlias = MapJoinProcessor + .genMapJoinLocalWork(newWork, newMapJoinOp, mapJoinPos); + // clean up the mapred work + newWork.setOpParseCtxMap(null); + newWork.setJoinTree(null); - return bigTableAlias; + return bigTableAlias; - } catch (Exception e) { - e.printStackTrace(); - throw new SemanticException("Generate New MapJoin Opertor Exeception " + e.getMessage()); + } catch (Exception e) { + e.printStackTrace(); + throw new SemanticException("Generate New MapJoin Opertor Exeception " + e.getMessage()); + } } - } - /** * convert a regular join to a a map-side join. * @@ -257,7 +253,7 @@ * @param noCheckOuterJoin */ public static MapJoinOperator convertMapJoin( - LinkedHashMap, OpParseContext> opParseCtxMap, + Map, OpParseContext> opParseCtxMap, JoinOperator op, QBJoinTree joinTree, int mapJoinPos, boolean noCheckOuterJoin) throws SemanticException { // outer join cannot be performed on a table which is being cached @@ -480,6 +476,66 @@ return mapJoinOp; } + + /** + * convert a sortmerge join to a a map-side join. + * + * @param opParseCtxMap + * @param op + * join operator + * @param joinTree + * qb join tree + * @param mapJoinPos + * position of the source to be read as part of map-reduce framework. All other sources + * are cached in memory + * @param noCheckOuterJoin + */ + public static MapJoinOperator convertMapJoin( + Map, OpParseContext> opParseCtxMap, + SMBMapJoinOperator op, QBJoinTree joinTree, int mapJoinPos, boolean noCheckOuterJoin) + throws SemanticException { + // Create a new map join operator + SMBJoinDesc smbJoinDesc = op.getConf(); + List keyCols = smbJoinDesc.getKeys().get(Byte.valueOf((byte) 0)); + TableDesc keyTableDesc = PlanUtils.getMapJoinKeyTableDesc(PlanUtils + .getFieldSchemasFromColumnList(keyCols, "mapjoinkey")); + MapJoinDesc mapJoinDesc = new MapJoinDesc(smbJoinDesc.getKeys(), + keyTableDesc, smbJoinDesc.getExprs(), + smbJoinDesc.getValueTblDescs(), smbJoinDesc.getValueTblDescs(), + smbJoinDesc.getOutputColumnNames(), + mapJoinPos, smbJoinDesc.getConds(), + smbJoinDesc.getFilters(), smbJoinDesc.isNoOuterJoin(), smbJoinDesc.getDumpFilePrefix()); + + RowResolver joinRS = opParseCtxMap.get(op).getRowResolver(); + MapJoinOperator mapJoinOp = (MapJoinOperator) OperatorFactory.getAndMakeChild( + mapJoinDesc, joinRS.getRowSchema(), + new ArrayList>()); + + OpParseContext ctx = new OpParseContext(joinRS); + opParseCtxMap.put(mapJoinOp, ctx); + + + // change the children of the original join operator to point to the map + // join operator + List> childOps = op.getChildOperators(); + for (Operator childOp : childOps) { + childOp.replaceParent(op, mapJoinOp); + } + mapJoinOp.setChildOperators(childOps); + op.setChildOperators(null); + + // change the parent of the original join operator to point to the map + // join operator + List> parentOps = op.getParentOperators(); + for (Operator parentOp : parentOps) { + parentOp.replaceChild(op, mapJoinOp); + } + mapJoinOp.setParentOperators(parentOps); + op.setParentOperators(null); + + return mapJoinOp; + } + public MapJoinOperator generateMapJoinOperator(ParseContext pctx, JoinOperator op, QBJoinTree joinTree, int mapJoinPos) throws SemanticException { HiveConf hiveConf = pctx.getConf(); Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/SortMergeJoinResolver.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/SortMergeJoinResolver.java (revision 0) +++ 
ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/SortMergeJoinResolver.java (working copy) @@ -0,0 +1,44 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.optimizer.physical; + +import java.util.ArrayList; + +import org.apache.hadoop.hive.ql.lib.Dispatcher; +import org.apache.hadoop.hive.ql.lib.Node; +import org.apache.hadoop.hive.ql.lib.TaskGraphWalker; +import org.apache.hadoop.hive.ql.parse.SemanticException; + + +public class SortMergeJoinResolver implements PhysicalPlanResolver { + @Override + public PhysicalContext resolve(PhysicalContext pctx) throws SemanticException { + + // create dispatcher and graph walker + Dispatcher disp = new SortMergeJoinTaskDispatcher(pctx); + TaskGraphWalker ogw = new TaskGraphWalker(disp); + + // get all the tasks nodes from root task + ArrayList topNodes = new ArrayList(); + topNodes.addAll(pctx.rootTasks); + + // begin to walk through the task tree. + ogw.startWalking(topNodes, null); + return pctx; + } +} Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/CommonJoinTaskDispatcher.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/CommonJoinTaskDispatcher.java (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/CommonJoinTaskDispatcher.java (working copy) @@ -0,0 +1,88 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hive.ql.optimizer.physical; + +import org.apache.hadoop.hive.ql.Context; +import org.apache.hadoop.hive.ql.exec.ConditionalTask; +import org.apache.hadoop.hive.ql.exec.JoinOperator; +import org.apache.hadoop.hive.ql.exec.MapJoinOperator; +import org.apache.hadoop.hive.ql.exec.MapRedTask; +import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.exec.Task; +import org.apache.hadoop.hive.ql.lib.Dispatcher; +import org.apache.hadoop.hive.ql.optimizer.MapJoinProcessor; +import org.apache.hadoop.hive.ql.parse.ParseContext; +import org.apache.hadoop.hive.ql.parse.QBJoinTree; +import org.apache.hadoop.hive.ql.parse.SemanticException; +import org.apache.hadoop.hive.ql.plan.MapredWork; +import org.apache.hadoop.hive.ql.plan.OperatorDesc; + +/** + * Iterator each tasks. If this task has a local work,create a new task for this local work, named + * MapredLocalTask. then make this new generated task depends on current task's parent task, and + * make current task depends on this new generated task + */ +public class CommonJoinTaskDispatcher extends JoinTaskDispatcher implements Dispatcher { + + public CommonJoinTaskDispatcher(PhysicalContext context) { + super(context); + } + + @Override + public ConditionalTask processCurrentTask(MapRedTask currTask, + ConditionalTask conditionalTask, Context context) + throws SemanticException { + + // whether it contains common join op; if contains, return this common join op + JoinOperator joinOp = getJoinOp(currTask); + if (joinOp == null) { + return null; + } + currTask.setTaskTag(Task.COMMON_JOIN); + + // get parseCtx for this Join Operator + ParseContext parseCtx = physicalContext.getParseContext(); + QBJoinTree joinTree = parseCtx.getJoinContext().get(joinOp); + + return processCurrentTask(currTask, currTask.getWork(), + conditionalTask, context, + joinTree, joinOp.getConf()); + } + + @Override + public MapJoinOperator getMapJoinOperator(MapRedTask task, + MapredWork work, QBJoinTree joinTree, int mapJoinPos) throws SemanticException { + JoinOperator newJoinOp = getJoinOp(task); + // generate the map join operator; already checked the map join + return MapJoinProcessor.convertMapJoin(work.getOpParseCtxMap(), newJoinOp, + joinTree, mapJoinPos, true); + } + + + private JoinOperator getJoinOp(MapRedTask task) throws SemanticException { + if (task.getWork() == null) { + return null; + } + Operator reducerOp = task.getWork().getReducer(); + if (reducerOp instanceof JoinOperator) { + return (JoinOperator) reducerOp; + } else { + return null; + } + } +} Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/PhysicalOptimizer.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/PhysicalOptimizer.java (revision 1406918) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/PhysicalOptimizer.java (working copy) @@ -52,6 +52,10 @@ if (hiveConf.getBoolVar(HiveConf.ConfVars.HIVECONVERTJOIN)) { resolvers.add(new CommonJoinResolver()); } + if (hiveConf.getBoolVar(HiveConf.ConfVars.HIVE_AUTO_SORT_MERGE_JOIN)) { + resolvers.add(new SortMergeJoinResolver()); + } + if (hiveConf.getBoolVar(HiveConf.ConfVars.HIVEOPTINDEXFILTER)) { resolvers.add(new IndexWhereResolver()); } Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/SortMergeJoinTaskDispatcher.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/SortMergeJoinTaskDispatcher.java 
(revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/SortMergeJoinTaskDispatcher.java (working copy) @@ -0,0 +1,229 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.optimizer.physical; + +import java.io.ByteArrayInputStream; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +import org.apache.hadoop.hive.ql.Context; +import org.apache.hadoop.hive.ql.exec.ConditionalTask; +import org.apache.hadoop.hive.ql.exec.MapJoinOperator; +import org.apache.hadoop.hive.ql.exec.MapRedTask; +import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator; +import org.apache.hadoop.hive.ql.exec.Task; +import org.apache.hadoop.hive.ql.exec.Utilities; +import org.apache.hadoop.hive.ql.lib.Dispatcher; +import org.apache.hadoop.hive.ql.optimizer.MapJoinProcessor; +import org.apache.hadoop.hive.ql.parse.OpParseContext; +import org.apache.hadoop.hive.ql.parse.ParseContext; +import org.apache.hadoop.hive.ql.parse.QBJoinTree; +import org.apache.hadoop.hive.ql.parse.SemanticException; +import org.apache.hadoop.hive.ql.plan.FetchWork; +import org.apache.hadoop.hive.ql.plan.MapredLocalWork; +import org.apache.hadoop.hive.ql.plan.MapredWork; +import org.apache.hadoop.hive.ql.plan.OperatorDesc; +import org.apache.hadoop.hive.ql.plan.PartitionDesc; + +/** + * Iterator each tasks. If this task has a local work,create a new task for this local work, named + * MapredLocalTask. 
then make this new generated task depends on current task's parent task, and + * make current task depends on this new generated task + */ +public class SortMergeJoinTaskDispatcher extends JoinTaskDispatcher implements Dispatcher { + + public SortMergeJoinTaskDispatcher(PhysicalContext context) { + super(context); + } + + // Convert the work in the SMB plan to a regular join + // Note that the operator tree is not fixed, only the path/alias mappings in the + // plan are fixed + private void genSMBJoinWork(MapredWork currWork, SMBMapJoinOperator joinOp) { + // Remove the paths which are not part of aliasToPartitionInfo + Map aliasToPartitionInfo = currWork.getAliasToPartnInfo(); + List removePaths = new ArrayList(); + + for (Map.Entry> entry : currWork.getPathToAliases().entrySet()) { + boolean keepPath = false; + for (String alias : entry.getValue()) { + if (aliasToPartitionInfo.containsKey(alias)) { + keepPath = true; + break; + } + } + + // Remove if the path is not present + if (!keepPath) { + removePaths.add(entry.getKey()); + } + } + + List removeAliases = new ArrayList(); + for (String removePath : removePaths) { + removeAliases.addAll(currWork.getPathToAliases().get(removePath)); + currWork.getPathToAliases().remove(removePath); + currWork.getPathToPartitionInfo().remove(removePath); + } + + for (String alias : removeAliases) { + currWork.getAliasToPartnInfo().remove(alias); + currWork.getAliasToWork().remove(alias); + } + + // Get the MapredLocalWork + MapredLocalWork localWork = joinOp.getConf().getLocalWork(); + + for (Map.Entry> entry : localWork.getAliasToWork() + .entrySet()) { + String alias = entry.getKey(); + Operator op = entry.getValue(); + FetchWork fetchWork = localWork.getAliasToFetchWork().get(alias); + + // Add the entry in mapredwork + currWork.getAliasToWork().put(alias, op); + + PartitionDesc partitionInfo = currWork.getAliasToPartnInfo().get(alias); + if (fetchWork.getTblDir() != null) { + ArrayList aliases = new ArrayList(); + aliases.add(alias); + currWork.getPathToAliases().put(fetchWork.getTblDir(), aliases); + currWork.getPathToPartitionInfo().put(fetchWork.getTblDir(), partitionInfo); + } + else { + for (String pathDir : fetchWork.getPartDir()) { + ArrayList aliases = new ArrayList(); + aliases.add(alias); + currWork.getPathToAliases().put(pathDir, aliases); + currWork.getPathToPartitionInfo().put(pathDir, partitionInfo); + } + } + } + } + + private MapredWork convertSMBToJoin(MapredWork currWork, SMBMapJoinOperator oldJoinOp) + throws SemanticException { + try { + String xml = currWork.toXML(); + + // create map join task and set big table as i + // deep copy a new mapred work from xml + InputStream in = new ByteArrayInputStream(xml.getBytes("UTF-8")); + MapredWork newWork = Utilities.deserializeMapRedWork(in, physicalContext.getConf()); + SMBMapJoinOperator newJoinOp = getSMBMapJoinOp(newWork); + + // Add the row resolver for the new operator + Map, OpParseContext> opParseContextMap = + physicalContext.getParseContext().getOpParseCtx(); + opParseContextMap.put(newJoinOp, opParseContextMap.get(oldJoinOp)); + // create a mapred task for this work + genSMBJoinWork(newWork, newJoinOp); + return newWork; + } catch (Exception e) { + e.printStackTrace(); + throw new SemanticException("Generate Map Join Task Error: " + e.getMessage()); + } + } + + @Override + public ConditionalTask processCurrentTask(MapRedTask currTask, + ConditionalTask conditionalTask, Context context) + throws SemanticException { + + // whether it contains common join op; if contains, return 
this common join op + SMBMapJoinOperator joinOp = getSMBMapJoinOp(currTask); + if (joinOp == null) { + return null; + } + if (!joinOp.isConvertedJoin()) { + return null; + } + currTask.setTaskTag(Task.CONVERTED_SORTMERGEJOIN); + + MapredWork currWork = currTask.getWork(); + + // Convert the sort-merge join into a regular join. + // Then, convert all the join into a series of map-joins. + MapredWork currJoinWork = convertSMBToJoin(currWork, joinOp); + + // get parseCtx for this Join Operator + ParseContext parseCtx = physicalContext.getParseContext(); + QBJoinTree joinTree = parseCtx.getSmbMapJoinContext().get(joinOp); + + return processCurrentTask(currTask, currJoinWork, + conditionalTask, context, joinTree, joinOp.getConf()); + } + + private SMBMapJoinOperator getSMBMapJoinOp(Operator currOp) { + if (currOp instanceof SMBMapJoinOperator) { + return (SMBMapJoinOperator) currOp; + } + + List> childOps = currOp.getChildOperators(); + if ((childOps == null) || (childOps.isEmpty())) { + return null; + } + + for (Operator childOp : childOps) { + SMBMapJoinOperator ret = getSMBMapJoinOp(childOp); + if (ret != null) { + return ret; + } + } + + return null; + } + + private SMBMapJoinOperator getSMBMapJoinOp(MapredWork work) throws SemanticException { + for (Operator op : work.getAliasToWork().values()) { + SMBMapJoinOperator smbMapJoinOp = getSMBMapJoinOp(op); + if (smbMapJoinOp != null) { + return smbMapJoinOp; + } + } + + return null; + } + + private SMBMapJoinOperator getSMBMapJoinOp(MapRedTask task) throws SemanticException { + if (task.getWork() == null) { + return null; + } + return getSMBMapJoinOp(task.getWork()); + } + + + @Override + public MapJoinOperator getMapJoinOperator(MapRedTask task, + MapredWork work, QBJoinTree joinTree, int mapJoinPos) throws SemanticException { + SMBMapJoinOperator joinOp = getSMBMapJoinOp(task); + + // Add the row resolver for the new operator + Map,OpParseContext> opParseContextMap = + physicalContext.getParseContext().getOpParseCtx(); + opParseContextMap.put(joinOp, opParseContextMap.get(joinOp)); + + // generate the map join operator; already checked the map join + return MapJoinProcessor.convertMapJoin(work.getOpParseCtxMap(), joinOp, + joinTree, mapJoinPos, true); + + } +} Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/JoinTaskDispatcher.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/JoinTaskDispatcher.java (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/JoinTaskDispatcher.java (working copy) @@ -0,0 +1,275 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hive.ql.optimizer.physical; + +import java.io.ByteArrayInputStream; +import java.io.InputStream; +import java.io.Serializable; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Stack; + +import org.apache.hadoop.fs.ContentSummary; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.Context; +import org.apache.hadoop.hive.ql.exec.ConditionalTask; +import org.apache.hadoop.hive.ql.exec.MapJoinOperator; +import org.apache.hadoop.hive.ql.exec.MapRedTask; +import org.apache.hadoop.hive.ql.exec.Task; +import org.apache.hadoop.hive.ql.exec.TaskFactory; +import org.apache.hadoop.hive.ql.exec.Utilities; +import org.apache.hadoop.hive.ql.lib.Dispatcher; +import org.apache.hadoop.hive.ql.lib.Node; +import org.apache.hadoop.hive.ql.lib.TaskGraphWalker.TaskGraphWalkerContext; +import org.apache.hadoop.hive.ql.optimizer.MapJoinProcessor; +import org.apache.hadoop.hive.ql.parse.ParseContext; +import org.apache.hadoop.hive.ql.parse.QBJoinTree; +import org.apache.hadoop.hive.ql.parse.SemanticException; +import org.apache.hadoop.hive.ql.plan.ConditionalResolverCommonJoin; +import + org.apache.hadoop.hive.ql.plan.ConditionalResolverCommonJoin.ConditionalResolverCommonJoinCtx; +import org.apache.hadoop.hive.ql.plan.ConditionalWork; +import org.apache.hadoop.hive.ql.plan.JoinDesc; +import org.apache.hadoop.hive.ql.plan.MapredWork; + +/** + * Iterator each tasks. If this task has a local work,create a new task for this local work, named + * MapredLocalTask. then make this new generated task depends on current task's parent task, and + * make current task depends on this new generated task + */ +public abstract class JoinTaskDispatcher implements Dispatcher { + + protected final PhysicalContext physicalContext; + + public JoinTaskDispatcher(PhysicalContext context) { + super(); + physicalContext = context; + } + + protected ConditionalTask processCurrentTask(MapRedTask currTask, + MapredWork currWork, + ConditionalTask conditionalTask, Context context, + QBJoinTree joinTree, JoinDesc joinDesc) + throws SemanticException { + + // create conditional work list and task list + List listWorks = new ArrayList(); + List> listTasks = new ArrayList>(); + + // create alias to task mapping and alias to input file mapping for resolver + HashMap> aliasToTask = + new HashMap>(); + HashMap> pathToAliases = currTask.getWork().getPathToAliases(); + + // get parseCtx for this Join Operator + ParseContext parseCtx = physicalContext.getParseContext(); + + // start to generate multiple map join tasks + Byte[] order = joinDesc.getTagOrder(); + int numAliases = order.length; + + long aliasTotalKnownInputSize = 0; + HashMap aliasToSize = new HashMap(); + try { + // go over all the input paths, and calculate a known total size, known + // size for each input alias. + Utilities.getInputSummary(context, currWork, null).getLength(); + + // set alias to size mapping, this can be used to determine if one table + // is chosen as big table, what's the total size of left tables, which + // are going to be small tables. 
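The alias-size bookkeeping above can be illustrated in isolation. The following standalone sketch (hypothetical class and method names; plain collections standing in for the plan's path-to-alias mapping and ContentSummary) performs the same accumulation: every alias that reads a path is charged that path's length, and the per-alias totals later feed the big-table decision.

import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class AliasSizeSketch {
  // Same accumulation as the pathToAliases/ContentSummary loop above, with plain
  // collections: each alias reading a path is charged that path's length.
  static Map<String, Long> aliasToKnownSize(Map<String, List<String>> pathToAliases,
                                            Map<String, Long> pathToLength) {
    Map<String, Long> aliasToSize = new HashMap<String, Long>();
    for (Map.Entry<String, List<String>> entry : pathToAliases.entrySet()) {
      Long length = pathToLength.get(entry.getKey());
      if (length == null) {
        continue; // size unknown for this path, like a null ContentSummary
      }
      for (String alias : entry.getValue()) {
        Long soFar = aliasToSize.get(alias);
        aliasToSize.put(alias, (soFar == null ? 0L : soFar.longValue()) + length.longValue());
      }
    }
    return aliasToSize;
  }

  public static void main(String[] args) {
    Map<String, List<String>> pathToAliases = new HashMap<String, List<String>>();
    pathToAliases.put("/warehouse/bigtbl/part1", Arrays.asList("bigtbl"));
    pathToAliases.put("/warehouse/bigtbl/part2", Arrays.asList("bigtbl"));
    pathToAliases.put("/warehouse/smalltbl", Arrays.asList("smalltbl"));
    Map<String, Long> pathToLength = new HashMap<String, Long>();
    pathToLength.put("/warehouse/bigtbl/part1", 800L);
    pathToLength.put("/warehouse/bigtbl/part2", 700L);
    pathToLength.put("/warehouse/smalltbl", 50L);
    // bigtbl accumulates 800 + 700 = 1500; smalltbl stays at 50
    System.out.println(aliasToKnownSize(pathToAliases, pathToLength));
  }
}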
+ for (Map.Entry> entry : pathToAliases.entrySet()) { + String path = entry.getKey(); + List aliasList = entry.getValue(); + ContentSummary cs = context.getCS(path); + if (cs != null) { + long size = cs.getLength(); + for (String alias : aliasList) { + aliasTotalKnownInputSize += size; + Long es = aliasToSize.get(alias); + if (es == null) { + es = new Long(0); + } + es += size; + aliasToSize.put(alias, es); + } + } + } + + HashSet bigTableCandidates = MapJoinProcessor.getBigTableCandidates(joinDesc + .getConds()); + + // no table could be the big table; there is no need to convert + if (bigTableCandidates == null) { + return null; + } + currWork.setOpParseCtxMap(parseCtx.getOpParseCtx()); + currWork.setJoinTree(joinTree); + + String xml = currWork.toXML(); + String bigTableAlias = null; + + long ThresholdOfSmallTblSizeSum = HiveConf.getLongVar(context.getConf(), + HiveConf.ConfVars.HIVESMALLTABLESFILESIZE); + for (int i = 0; i < numAliases; i++) { + // this table cannot be big table + if (!bigTableCandidates.contains(i)) { + continue; + } + + // create map join task and set big table as i + // deep copy a new mapred work from xml + InputStream in = new ByteArrayInputStream(xml.getBytes("UTF-8")); + MapredWork newWork = Utilities.deserializeMapRedWork(in, physicalContext.getConf()); + // create a mapred task for this work + MapRedTask newTask = (MapRedTask) TaskFactory.get(newWork, physicalContext + .getParseContext().getConf()); + + // generate the map join operator; already checked the map join + MapJoinOperator newMapJoinOp = getMapJoinOperator(newTask, newWork, joinTree, i); + bigTableAlias = MapJoinProcessor.genMapJoinOpAndLocalWork(newWork, newMapJoinOp, i); + + Long aliasKnownSize = aliasToSize.get(bigTableAlias); + if (aliasKnownSize != null && aliasKnownSize.longValue() > 0) { + long smallTblTotalKnownSize = aliasTotalKnownInputSize + - aliasKnownSize.longValue(); + if (smallTblTotalKnownSize > ThresholdOfSmallTblSizeSum) { + // this table is not good to be a big table. 
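The per-candidate acceptance test in the loop above reduces to one subtraction and one comparison: with candidate i streamed as the big table, everything else must fit, in aggregate, under the small-table size threshold. A minimal sketch of that check, reusing the alias-size map idea from the previous example (illustrative names, not Hive API):

import java.util.HashMap;
import java.util.Map;

public class BigTableCandidateCheck {
  // Mirrors the check above: with bigAlias streamed, the remaining (small) tables
  // must fit under the small-table size threshold in aggregate.
  static boolean acceptableBigTable(String bigAlias, Map<String, Long> aliasToSize,
                                    long totalKnownSize, long smallTableSizeThreshold) {
    Long bigSize = aliasToSize.get(bigAlias);
    if (bigSize == null || bigSize.longValue() <= 0) {
      return true; // size unknown: not rejected here, as in the loop above
    }
    long smallTablesTotal = totalKnownSize - bigSize.longValue();
    return smallTablesTotal <= smallTableSizeThreshold;
  }

  public static void main(String[] args) {
    Map<String, Long> aliasToSize = new HashMap<String, Long>();
    aliasToSize.put("bigtbl", 1500L);
    aliasToSize.put("smalltbl", 50L);
    long total = 1500L + 50L;
    System.out.println(acceptableBigTable("bigtbl", aliasToSize, total, 100L));   // true: 50 <= 100
    System.out.println(acceptableBigTable("smalltbl", aliasToSize, total, 100L)); // false: 1500 > 100
  }
}

As in the dispatcher above, an unknown size does not reject a candidate; the conditional task's resolver makes the final choice at run time.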
+ continue; + } + } + + // add into conditional task + listWorks.add(newWork); + listTasks.add(newTask); + newTask.setTaskTag(Task.CONVERTED_MAPJOIN); + + // set up backup task + newTask.setBackupTask(currTask); + newTask.setBackupChildrenTasks(currTask.getChildTasks()); + + // put the mapping alias to task + aliasToTask.put(bigTableAlias, newTask); + } + } catch (Exception e) { + e.printStackTrace(); + throw new SemanticException("Generate Map Join Task Error: " + e.getMessage()); + } + + // insert current common join task to conditional task + listWorks.add(currTask.getWork()); + listTasks.add(currTask); + // clear JoinTree and OP Parse Context + currWork.setOpParseCtxMap(null); + currWork.setJoinTree(null); + + // create conditional task and insert conditional task into task tree + ConditionalWork cndWork = new ConditionalWork(listWorks); + ConditionalTask cndTsk = (ConditionalTask) TaskFactory.get(cndWork, parseCtx.getConf()); + cndTsk.setListTasks(listTasks); + + // set resolver and resolver context + cndTsk.setResolver(new ConditionalResolverCommonJoin()); + ConditionalResolverCommonJoinCtx resolverCtx = new ConditionalResolverCommonJoinCtx(); + resolverCtx.setPathToAliases(pathToAliases); + resolverCtx.setAliasToKnownSize(aliasToSize); + resolverCtx.setAliasToTask(aliasToTask); + resolverCtx.setCommonJoinTask(currTask); + resolverCtx.setLocalTmpDir(context.getLocalScratchDir(false)); + resolverCtx.setHdfsTmpDir(context.getMRScratchDir()); + cndTsk.setResolverCtx(resolverCtx); + + // replace the current task with the new generated conditional task + this.replaceTaskWithConditionalTask(currTask, cndTsk, physicalContext); + return cndTsk; + } + + private void replaceTaskWithConditionalTask( + Task currTask, ConditionalTask cndTsk, + PhysicalContext physicalContext) { + // add this task into task tree + // set all parent tasks + List> parentTasks = currTask.getParentTasks(); + currTask.setParentTasks(null); + if (parentTasks != null) { + for (Task tsk : parentTasks) { + // make new generated task depends on all the parent tasks of current task. + tsk.addDependentTask(cndTsk); + // remove the current task from its original parent task's dependent task + tsk.removeDependentTask(currTask); + } + } else { + // remove from current root task and add conditional task to root tasks + physicalContext.removeFromRootTask(currTask); + physicalContext.addToRootTask(cndTsk); + } + // set all child tasks + List> oldChildTasks = currTask.getChildTasks(); + if (oldChildTasks != null) { + for (Task tsk : cndTsk.getListTasks()) { + if (tsk.equals(currTask)) { + continue; + } + for (Task oldChild : oldChildTasks) { + tsk.addDependentTask(oldChild); + } + } + } + } + + @Override + public Object dispatch(Node nd, Stack stack, Object... 
nodeOutputs) + throws SemanticException { + if (nodeOutputs == null || nodeOutputs.length == 0) { + throw new SemanticException("No Dispatch Context"); + } + + TaskGraphWalkerContext walkerCtx = (TaskGraphWalkerContext) nodeOutputs[0]; + + Task currTask = (Task) nd; + // not map reduce task or not conditional task, just skip + if (currTask.isMapRedTask()) { + if (currTask instanceof ConditionalTask) { + // get the list of task + List> taskList = ((ConditionalTask) currTask).getListTasks(); + for (Task tsk : taskList) { + if (tsk.isMapRedTask()) { + ConditionalTask cndTask = this.processCurrentTask((MapRedTask) tsk, + ((ConditionalTask) currTask), physicalContext.getContext()); + walkerCtx.addToDispatchList(cndTask); + } + } + } else { + ConditionalTask cndTask = this.processCurrentTask((MapRedTask) currTask, null, + physicalContext.getContext()); + walkerCtx.addToDispatchList(cndTask); + } + } + return null; + } + + public abstract ConditionalTask processCurrentTask(MapRedTask currTask, + ConditionalTask conditionalTask, Context context) + throws SemanticException; + + public abstract MapJoinOperator getMapJoinOperator(MapRedTask task, + MapredWork qork, QBJoinTree joinTree, int mapJoinPos) throws SemanticException; +} Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/CommonJoinResolver.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/CommonJoinResolver.java (revision 1406918) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/CommonJoinResolver.java (working copy) @@ -17,41 +17,12 @@ */ package org.apache.hadoop.hive.ql.optimizer.physical; -import java.io.ByteArrayInputStream; -import java.io.InputStream; -import java.io.Serializable; import java.util.ArrayList; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Stack; -import org.apache.hadoop.fs.ContentSummary; -import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.hadoop.hive.ql.Context; -import org.apache.hadoop.hive.ql.exec.ConditionalTask; -import org.apache.hadoop.hive.ql.exec.JoinOperator; -import org.apache.hadoop.hive.ql.exec.MapRedTask; -import org.apache.hadoop.hive.ql.exec.Operator; -import org.apache.hadoop.hive.ql.exec.Task; -import org.apache.hadoop.hive.ql.exec.TaskFactory; -import org.apache.hadoop.hive.ql.exec.Utilities; import org.apache.hadoop.hive.ql.lib.Dispatcher; import org.apache.hadoop.hive.ql.lib.Node; import org.apache.hadoop.hive.ql.lib.TaskGraphWalker; -import org.apache.hadoop.hive.ql.lib.TaskGraphWalker.TaskGraphWalkerContext; -import org.apache.hadoop.hive.ql.optimizer.MapJoinProcessor; -import org.apache.hadoop.hive.ql.parse.ParseContext; -import org.apache.hadoop.hive.ql.parse.QBJoinTree; import org.apache.hadoop.hive.ql.parse.SemanticException; -import org.apache.hadoop.hive.ql.plan.ConditionalResolverCommonJoin; -import - org.apache.hadoop.hive.ql.plan.ConditionalResolverCommonJoin.ConditionalResolverCommonJoinCtx; -import org.apache.hadoop.hive.ql.plan.ConditionalWork; -import org.apache.hadoop.hive.ql.plan.JoinDesc; -import org.apache.hadoop.hive.ql.plan.MapredWork; -import org.apache.hadoop.hive.ql.plan.OperatorDesc; public class CommonJoinResolver implements PhysicalPlanResolver { @@ -70,239 +41,4 @@ ogw.startWalking(topNodes, null); return pctx; } - - - /** - * Iterator each tasks. If this task has a local work,create a new task for this local work, named - * MapredLocalTask. 
then make this new generated task depends on current task's parent task, and - * make current task depends on this new generated task - */ - class CommonJoinTaskDispatcher implements Dispatcher { - - private final PhysicalContext physicalContext; - - public CommonJoinTaskDispatcher(PhysicalContext context) { - super(); - physicalContext = context; - } - - private ConditionalTask processCurrentTask(MapRedTask currTask, - ConditionalTask conditionalTask, Context context) - throws SemanticException { - - // whether it contains common join op; if contains, return this common join op - JoinOperator joinOp = getJoinOp(currTask); - if (joinOp == null) { - return null; - } - currTask.setTaskTag(Task.COMMON_JOIN); - - MapredWork currWork = currTask.getWork(); - // create conditional work list and task list - List listWorks = new ArrayList(); - List> listTasks = new ArrayList>(); - - // create alias to task mapping and alias to input file mapping for resolver - HashMap> aliasToTask = new HashMap>(); - HashMap> pathToAliases = currTask.getWork().getPathToAliases(); - - // get parseCtx for this Join Operator - ParseContext parseCtx = physicalContext.getParseContext(); - QBJoinTree joinTree = parseCtx.getJoinContext().get(joinOp); - - // start to generate multiple map join tasks - JoinDesc joinDesc = joinOp.getConf(); - Byte[] order = joinDesc.getTagOrder(); - int numAliases = order.length; - - long aliasTotalKnownInputSize = 0; - HashMap aliasToSize = new HashMap(); - try { - // go over all the input paths, and calculate a known total size, known - // size for each input alias. - Utilities.getInputSummary(context, currWork, null).getLength(); - - // set alias to size mapping, this can be used to determine if one table - // is choosen as big table, what's the total size of left tables, which - // are going to be small tables. 
- for (Map.Entry> entry : pathToAliases.entrySet()) { - String path = entry.getKey(); - List aliasList = entry.getValue(); - ContentSummary cs = context.getCS(path); - if (cs != null) { - long size = cs.getLength(); - for (String alias : aliasList) { - aliasTotalKnownInputSize += size; - Long es = aliasToSize.get(alias); - if(es == null) { - es = new Long(0); - } - es += size; - aliasToSize.put(alias, es); - } - } - } - - HashSet bigTableCandidates = MapJoinProcessor.getBigTableCandidates(joinDesc.getConds()); - - // no table could be the big table; there is no need to convert - if (bigTableCandidates == null) { - return null; - } - currWork.setOpParseCtxMap(parseCtx.getOpParseCtx()); - currWork.setJoinTree(joinTree); - - String xml = currWork.toXML(); - String bigTableAlias = null; - - long ThresholdOfSmallTblSizeSum = HiveConf.getLongVar(context.getConf(), - HiveConf.ConfVars.HIVESMALLTABLESFILESIZE); - for (int i = 0; i < numAliases; i++) { - // this table cannot be big table - if (!bigTableCandidates.contains(i)) { - continue; - } - - // create map join task and set big table as i - // deep copy a new mapred work from xml - InputStream in = new ByteArrayInputStream(xml.getBytes("UTF-8")); - MapredWork newWork = Utilities.deserializeMapRedWork(in, physicalContext.getConf()); - // create a mapred task for this work - MapRedTask newTask = (MapRedTask) TaskFactory.get(newWork, physicalContext - .getParseContext().getConf()); - JoinOperator newJoinOp = getJoinOp(newTask); - - // optimize this newWork and assume big table position is i - bigTableAlias = MapJoinProcessor.genMapJoinOpAndLocalWork(newWork, newJoinOp, i); - - Long aliasKnownSize = aliasToSize.get(bigTableAlias); - if (aliasKnownSize != null && aliasKnownSize.longValue() > 0) { - long smallTblTotalKnownSize = aliasTotalKnownInputSize - - aliasKnownSize.longValue(); - if(smallTblTotalKnownSize > ThresholdOfSmallTblSizeSum) { - //this table is not good to be a big table. 
- continue; - } - } - - // add into conditional task - listWorks.add(newWork); - listTasks.add(newTask); - newTask.setTaskTag(Task.CONVERTED_MAPJOIN); - - //set up backup task - newTask.setBackupTask(currTask); - newTask.setBackupChildrenTasks(currTask.getChildTasks()); - - // put the mapping alias to task - aliasToTask.put(bigTableAlias, newTask); - } - } catch (Exception e) { - e.printStackTrace(); - throw new SemanticException("Generate Map Join Task Error: " + e.getMessage()); - } - - // insert current common join task to conditional task - listWorks.add(currTask.getWork()); - listTasks.add(currTask); - // clear JoinTree and OP Parse Context - currWork.setOpParseCtxMap(null); - currWork.setJoinTree(null); - - // create conditional task and insert conditional task into task tree - ConditionalWork cndWork = new ConditionalWork(listWorks); - ConditionalTask cndTsk = (ConditionalTask) TaskFactory.get(cndWork, parseCtx.getConf()); - cndTsk.setListTasks(listTasks); - - // set resolver and resolver context - cndTsk.setResolver(new ConditionalResolverCommonJoin()); - ConditionalResolverCommonJoinCtx resolverCtx = new ConditionalResolverCommonJoinCtx(); - resolverCtx.setPathToAliases(pathToAliases); - resolverCtx.setAliasToKnownSize(aliasToSize); - resolverCtx.setAliasToTask(aliasToTask); - resolverCtx.setCommonJoinTask(currTask); - resolverCtx.setLocalTmpDir(context.getLocalScratchDir(false)); - resolverCtx.setHdfsTmpDir(context.getMRScratchDir()); - cndTsk.setResolverCtx(resolverCtx); - - //replace the current task with the new generated conditional task - this.replaceTaskWithConditionalTask(currTask, cndTsk, physicalContext); - return cndTsk; - } - - private void replaceTaskWithConditionalTask( - Task currTask, ConditionalTask cndTsk, - PhysicalContext physicalContext) { - // add this task into task tree - // set all parent tasks - List> parentTasks = currTask.getParentTasks(); - currTask.setParentTasks(null); - if (parentTasks != null) { - for (Task tsk : parentTasks) { - // make new generated task depends on all the parent tasks of current task. - tsk.addDependentTask(cndTsk); - // remove the current task from its original parent task's dependent task - tsk.removeDependentTask(currTask); - } - } else { - // remove from current root task and add conditional task to root tasks - physicalContext.removeFromRootTask(currTask); - physicalContext.addToRootTask(cndTsk); - } - // set all child tasks - List> oldChildTasks = currTask.getChildTasks(); - if (oldChildTasks != null) { - for (Task tsk : cndTsk.getListTasks()) { - if (tsk.equals(currTask)) { - continue; - } - for (Task oldChild : oldChildTasks) { - tsk.addDependentTask(oldChild); - } - } - } - } - - @Override - public Object dispatch(Node nd, Stack stack, Object... 
nodeOutputs) - throws SemanticException { - if (nodeOutputs == null || nodeOutputs.length == 0) { - throw new SemanticException("No Dispatch Context"); - } - - TaskGraphWalkerContext walkerCtx = (TaskGraphWalkerContext) nodeOutputs[0]; - - Task currTask = (Task) nd; - // not map reduce task or not conditional task, just skip - if (currTask.isMapRedTask()) { - if (currTask instanceof ConditionalTask) { - // get the list of task - List> taskList = ((ConditionalTask) currTask).getListTasks(); - for (Task tsk : taskList) { - if (tsk.isMapRedTask()) { - ConditionalTask cndTask = this.processCurrentTask((MapRedTask) tsk, - ((ConditionalTask) currTask), physicalContext.getContext()); - walkerCtx.addToDispatchList(cndTask); - } - } - } else { - ConditionalTask cndTask = this.processCurrentTask((MapRedTask) currTask, null, physicalContext.getContext()); - walkerCtx.addToDispatchList(cndTask); - } - } - return null; - } - - private JoinOperator getJoinOp(MapRedTask task) throws SemanticException { - if (task.getWork() == null) { - return null; - } - Operator reducerOp = task.getWork().getReducer(); - if (reducerOp instanceof JoinOperator) { - return (JoinOperator) reducerOp; - } else { - return null; - } - } - } } Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortedMergeBucketMapJoinOptimizer.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortedMergeBucketMapJoinOptimizer.java (revision 1406918) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortedMergeBucketMapJoinOptimizer.java (working copy) @@ -28,13 +28,12 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.hadoop.hive.metastore.api.Order; import org.apache.hadoop.hive.ql.ErrorMsg; -import org.apache.hadoop.hive.ql.exec.FunctionRegistry; +import org.apache.hadoop.hive.ql.exec.JoinOperator; import org.apache.hadoop.hive.ql.exec.MapJoinOperator; import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator; import org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator; -import org.apache.hadoop.hive.ql.exec.TableScanOperator; import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker; import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher; import org.apache.hadoop.hive.ql.lib.Dispatcher; @@ -44,20 +43,10 @@ import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx; import org.apache.hadoop.hive.ql.lib.Rule; import org.apache.hadoop.hive.ql.lib.RuleRegExp; -import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.hive.ql.metadata.Partition; -import org.apache.hadoop.hive.ql.metadata.Table; -import org.apache.hadoop.hive.ql.optimizer.ppr.PartitionPruner; import org.apache.hadoop.hive.ql.parse.ParseContext; -import org.apache.hadoop.hive.ql.parse.PrunedPartitionList; -import org.apache.hadoop.hive.ql.parse.QBJoinTree; import org.apache.hadoop.hive.ql.parse.SemanticException; -import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc; -import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; -import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; import org.apache.hadoop.hive.ql.plan.OperatorDesc; import org.apache.hadoop.hive.ql.plan.SMBJoinDesc; -import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; //try to replace a bucket map join with a sorted merge map join public class SortedMergeBucketMapJoinOptimizer implements Transform { @@ -68,9 +57,38 @@ public SortedMergeBucketMapJoinOptimizer() { 
} + private void getListOfRejectedJoins( + ParseContext pctx, SortBucketJoinOptProcCtx smbJoinContext) + throws SemanticException { + + // go through all joins - it should only contain selects, filters and scriptoperators + Map opRules = new LinkedHashMap(); + opRules.put(new RuleRegExp("R1", JoinOperator.getOperatorName() + "%"), + getCheckCandidateJoin()); + + // The dispatcher fires the processor corresponding to the closest matching + // rule and passes the context along + Dispatcher disp = new DefaultRuleDispatcher(getDefaultProc(), opRules, smbJoinContext); + GraphWalker ogw = new DefaultGraphWalker(disp); + + // Create a list of topop nodes + ArrayList topNodes = new ArrayList(); + topNodes.addAll(pctx.getTopOps().values()); + ogw.startWalking(topNodes, null); + } + @Override public ParseContext transform(ParseContext pctx) throws SemanticException { + HiveConf conf = pctx.getConf(); + SortBucketJoinOptProcCtx smbJoinContext = + new SortBucketJoinOptProcCtx(conf); + // Get a list of joins which cannot be converted to a sort merge join + // Only selects and filters operators are allowed between the table scan and + // join currently. More operators can be added - the method supportAutomaticSortMergeJoin + // dictates which operator is allowed + getListOfRejectedJoins(pctx, smbJoinContext); + Map opRules = new LinkedHashMap(); // go through all map joins and find out all which have enabled bucket map // join. @@ -78,7 +96,15 @@ getSortedMergeBucketMapjoinProc(pctx)); // The dispatcher fires the processor corresponding to the closest matching // rule and passes the context along - Dispatcher disp = new DefaultRuleDispatcher(getDefaultProc(), opRules, null); + + // There is no need for the user to specify mapjoin for it to be + // converted to sort-merge join + if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_AUTO_SORT_MERGE_JOIN)) { + opRules.put(new RuleRegExp("R2", "JOIN%"), + getSortedMergeJoinProc(pctx)); + } + + Dispatcher disp = new DefaultRuleDispatcher(getDefaultProc(), opRules, smbJoinContext); GraphWalker ogw = new DefaultGraphWalker(disp); // Create a list of topop nodes @@ -89,10 +115,41 @@ return pctx; } + class SortedMergeJoinProc extends AbstractSMBJoinProc implements NodeProcessor { + + public SortedMergeJoinProc(ParseContext pctx) { + super(pctx); + } + + public SortedMergeJoinProc() { + } + + @Override + public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, + Object... nodeOutputs) throws SemanticException { + + JoinOperator joinOp = (JoinOperator) nd; + SortBucketJoinOptProcCtx smbJoinContext = (SortBucketJoinOptProcCtx)procCtx; + + boolean convert = + canConvertJoinToSMBJoin( + joinOp, smbJoinContext, pGraphContext); + + if (convert) { + convertJoinToSMBJoin(joinOp, smbJoinContext, pGraphContext); + } + return null; + } + } + private NodeProcessor getSortedMergeBucketMapjoinProc(ParseContext pctx) { return new SortedMergeBucketMapjoinProc(pctx); } + private NodeProcessor getSortedMergeJoinProc(ParseContext pctx) { + return new SortedMergeJoinProc(pctx); + } + private NodeProcessor getDefaultProc() { return new NodeProcessor() { @Override @@ -104,70 +161,57 @@ }; } - class SortedMergeBucketMapjoinProc implements NodeProcessor { - private ParseContext pGraphContext; + // check if the join operator encountered is a candidate for being converted + // to a sort-merge join + private NodeProcessor getCheckCandidateJoin() { + return new NodeProcessor() { + @Override + public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, + Object... 
nodeOutputs) throws SemanticException { + SortBucketJoinOptProcCtx smbJoinContext = (SortBucketJoinOptProcCtx)procCtx; + JoinOperator joinOperator = (JoinOperator)nd; + int size = stack.size(); + if (!(stack.get(size-1) instanceof JoinOperator) || + !(stack.get(size-2) instanceof ReduceSinkOperator)) { + smbJoinContext.getListOfRejectedJoins().add(joinOperator); + return null; + } + // If any operator in the stack does not support a auto-conversion, this join should + // not be converted. + for (int pos = size -3; pos >= 0; pos--) { + Operator op = (Operator)stack.get(pos); + if (!op.supportAutomaticSortMergeJoin()) { + smbJoinContext.getListOfRejectedJoins().add(joinOperator); + return null; + } + } + + return null; + } + }; + } + + class SortedMergeBucketMapjoinProc extends AbstractSMBJoinProc implements NodeProcessor { public SortedMergeBucketMapjoinProc(ParseContext pctx) { - this.pGraphContext = pctx; + super(pctx); } public SortedMergeBucketMapjoinProc() { } - // Return true or false based on whether the mapjoin was converted successfully to - // a sort-merge map join operator. - private boolean convertSMBJoin(Node nd, Stack stack, NodeProcessorCtx procCtx, - Object... nodeOutputs) throws SemanticException { + @Override + public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, + Object... nodeOutputs) throws SemanticException { if (nd instanceof SMBMapJoinOperator) { - return false; + return null; } + MapJoinOperator mapJoinOp = (MapJoinOperator) nd; - if (mapJoinOp.getConf().getAliasBucketFileNameMapping() == null - || mapJoinOp.getConf().getAliasBucketFileNameMapping().size() == 0) { - return false; - } + SortBucketJoinOptProcCtx smbJoinContext = (SortBucketJoinOptProcCtx)procCtx; - boolean tableSorted = true; - QBJoinTree joinCxt = this.pGraphContext.getMapJoinContext() - .get(mapJoinOp); - if (joinCxt == null) { - return false; - } - String[] srcs = joinCxt.getBaseSrc(); - int pos = 0; - - // All the tables/partitions columns should be sorted in the same order - // For example, if tables A and B are being joined on columns c1, c2 and c3 - // which are the sorted and bucketed columns. The join would work, as long - // c1, c2 and c3 are sorted in the same order. - List sortColumnsFirstTable = new ArrayList(); - - for (String src : srcs) { - tableSorted = tableSorted - && isTableSorted(this.pGraphContext, - mapJoinOp, - joinCxt, - src, - pos, - sortColumnsFirstTable); - pos++; - } - if (!tableSorted) { - //this is a mapjoin but not suit for a sort merge bucket map join. check outer joins - MapJoinProcessor.checkMapJoin(((MapJoinOperator) nd).getConf().getPosBigTable(), - ((MapJoinOperator) nd).getConf().getConds()); - return false; - } - // convert a bucket map join operator to a sorted merge bucket map join - // operator - convertToSMBJoin(mapJoinOp, srcs); - return true; - } - - @Override - public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, - Object... 
nodeOutputs) throws SemanticException { - boolean convert = convertSMBJoin(nd, stack, procCtx, nodeOutputs); + boolean convert = + canConvertBucketMapJoinToSMBJoin(mapJoinOp, stack, smbJoinContext, nodeOutputs); // Throw an error if the user asked for sort merge bucketed mapjoin to be enforced // and sort merge bucketed mapjoin cannot be performed if (!convert && @@ -176,6 +220,9 @@ throw new SemanticException(ErrorMsg.SORTMERGE_MAPJOIN_FAILED.getMsg()); } + if (convert) { + convertBucketMapJoinToSMBJoin(mapJoinOp, smbJoinContext, pGraphContext); + } return null; } @@ -212,124 +259,5 @@ } return smbJop; } - - /** - * Whether this table is eligible for a sort-merge join. - * - * @param pctx parse context - * @param op map join operator being considered - * @param joinTree join tree being considered - * @param alias table alias in the join tree being checked - * @param pos position of the table - * @param sortColumnsFirstTable The names and order of the sorted columns for the first table. - * It is not initialized when pos = 0. - * @return - * @throws SemanticException - */ - private boolean isTableSorted(ParseContext pctx, - MapJoinOperator op, - QBJoinTree joinTree, - String alias, - int pos, - List sortColumnsFirstTable) - throws SemanticException { - - Map> topOps = this.pGraphContext - .getTopOps(); - Map topToTable = this.pGraphContext - .getTopToTable(); - TableScanOperator tso = (TableScanOperator) topOps.get(alias); - if (tso == null) { - return false; - } - - List keys = op.getConf().getKeys().get((byte) pos); - // get all join columns from join keys stored in MapJoinDesc - List joinCols = new ArrayList(); - List joinKeys = new ArrayList(); - joinKeys.addAll(keys); - while (joinKeys.size() > 0) { - ExprNodeDesc node = joinKeys.remove(0); - if (node instanceof ExprNodeColumnDesc) { - joinCols.addAll(node.getCols()); - } else if (node instanceof ExprNodeGenericFuncDesc) { - ExprNodeGenericFuncDesc udfNode = ((ExprNodeGenericFuncDesc) node); - GenericUDF udf = udfNode.getGenericUDF(); - if (!FunctionRegistry.isDeterministic(udf)) { - return false; - } - joinKeys.addAll(0, udfNode.getChildExprs()); - } - } - - Table tbl = topToTable.get(tso); - if (tbl.isPartitioned()) { - PrunedPartitionList prunedParts = null; - try { - prunedParts = pGraphContext.getOpToPartList().get(tso); - if (prunedParts == null) { - prunedParts = PartitionPruner.prune(tbl, pGraphContext - .getOpToPartPruner().get(tso), pGraphContext.getConf(), alias, - pGraphContext.getPrunedPartitions()); - pGraphContext.getOpToPartList().put(tso, prunedParts); - } - } catch (HiveException e) { - LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e)); - throw new SemanticException(e.getMessage(), e); - } - List partitions = prunedParts.getNotDeniedPartns(); - // Populate the names and order of columns for the first partition of the - // first table - if ((pos == 0) && (partitions != null) && (!partitions.isEmpty())) { - Partition firstPartition = partitions.get(0); - sortColumnsFirstTable.addAll(firstPartition.getSortCols()); - } - - for (Partition partition : prunedParts.getNotDeniedPartns()) { - if (!checkSortColsAndJoinCols(partition.getSortCols(), - joinCols, - sortColumnsFirstTable)) { - return false; - } - } - return true; - } - - // Populate the names and order of columns for the first table - if (pos == 0) { - sortColumnsFirstTable.addAll(tbl.getSortCols()); - } - - return checkSortColsAndJoinCols(tbl.getSortCols(), - joinCols, - sortColumnsFirstTable); - } - - private boolean checkSortColsAndJoinCols(List 
sortCols, - List joinCols, - List sortColumnsFirstPartition) { - - if (sortCols == null || sortCols.size() != joinCols.size()) { - return false; - } - - List sortColNames = new ArrayList(); - - // The join columns should contain all the sort columns - // The sort columns of all the tables should be in the same order - // compare the column names and the order with the first table/partition. - for (int pos = 0; pos < sortCols.size(); pos++) { - Order o = sortCols.get(pos); - if (o.getOrder() != sortColumnsFirstPartition.get(pos).getOrder()) { - return false; - } - sortColNames.add(o.getCol()); - } - - // The column names and order (ascending/descending) matched - // The join columns should contain sort columns - return sortColNames.containsAll(joinCols); - } } - } Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/LeftSortMergeJoinBigTableMatcher.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/LeftSortMergeJoinBigTableMatcher.java (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/LeftSortMergeJoinBigTableMatcher.java (working copy) @@ -0,0 +1,32 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.optimizer; + +import org.apache.hadoop.hive.ql.exec.JoinOperator; +import org.apache.hadoop.hive.ql.parse.ParseContext; + +/* + * This is a pluggable policy to chose the candidate map-join table for converting a join to a + * sort merge join. The leftmost table is chosen as the join table. 
+ */ +public class LeftSortMergeJoinBigTableMatcher implements SortMergeJoinBigTableMatcher { + public int getBigTablePosition(ParseContext parseContext, JoinOperator joinOp) { + return 0; + } +} Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java (revision 1406918) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java (working copy) @@ -74,12 +74,15 @@ } transformations.add(new SamplePruner()); transformations.add(new MapJoinProcessor()); - if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTBUCKETMAPJOIN)) { + if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTBUCKETMAPJOIN) || + HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTSORTMERGEBUCKETMAPJOIN)) { transformations.add(new BucketMapJoinOptimizer()); - if(HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTSORTMERGEBUCKETMAPJOIN)) { - transformations.add(new SortedMergeBucketMapJoinOptimizer()); - } } + + if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTSORTMERGEBUCKETMAPJOIN)) { + transformations.add(new SortedMergeBucketMapJoinOptimizer()); + } + transformations.add(new UnionProcessor()); transformations.add(new JoinReorder()); if(HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTREDUCEDEDUPLICATION)) { Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/BucketJoinOptProcCtx.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/BucketJoinOptProcCtx.java (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/BucketJoinOptProcCtx.java (working copy) @@ -0,0 +1,146 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements.See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership.The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License.You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.optimizer; + +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.exec.JoinOperator; +import org.apache.hadoop.hive.ql.exec.MapJoinOperator; +import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx; +import org.apache.hadoop.hive.ql.metadata.Partition; + +public class BucketJoinOptProcCtx implements NodeProcessorCtx { + private static final Log LOG = + LogFactory.getLog(BucketJoinOptProcCtx.class.getName()); + + private final HiveConf conf; + + // we only convert map joins that follows a root table scan in the same + // mapper. That means there is no reducer between the root table scan and + // mapjoin. 
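For contrast with LeftSortMergeJoinBigTableMatcher above, a hypothetical alternative implementation of the pluggable big-table policy is sketched below (illustration only, not part of this patch; the SortMergeJoinBigTableMatcher interface it implements is added further down in this diff). It relies only on what the patch already exposes: the join descriptor's tag order has one entry per joined source, so the last position denotes the rightmost table. A policy only has to return the position of the table to treat as the big table.

package org.apache.hadoop.hive.ql.optimizer;

import org.apache.hadoop.hive.ql.exec.JoinOperator;
import org.apache.hadoop.hive.ql.parse.ParseContext;

/*
 * Hypothetical example, not part of this patch: a trivial alternative policy that
 * always picks the rightmost table as the big table, showing the shape of a
 * pluggable matcher.
 */
public class RightSortMergeJoinBigTableMatcher implements SortMergeJoinBigTableMatcher {
  public int getBigTablePosition(ParseContext parseContext, JoinOperator joinOp) {
    // One tag per joined source, in join order; the last position is the rightmost table.
    return joinOp.getConf().getTagOrder().length - 1;
  }
}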
+ private Set listOfRejectedMapjoins = new HashSet(); + private Set listOfRejectedJoins = new HashSet(); + + // The list of join operators which can be converted to a bucketed map join + private Set listOfConvertedJoins = new HashSet(); + + private Map> aliasToPartitionBucketNumberMapping; + private Map>> aliasToPartitionBucketFileNamesMapping; + private Map> bigTblPartsToBucketFileNames; + private Map bigTblPartsToBucketNumber; + private List joinAliases; + private String baseBigAlias; + private boolean bigTablePartitioned; + + public BucketJoinOptProcCtx(HiveConf conf) { + this.conf = conf; + } + + public HiveConf getConf() { + return conf; + } + + public Set getListOfRejectedMapjoins() { + return listOfRejectedMapjoins; + } + + public void setListOfRejectedMapjoins(Set listOfRejectedMapjoins) { + this.listOfRejectedMapjoins = listOfRejectedMapjoins; + } + + public Set getListOfRejectedJoins() { + return listOfRejectedJoins; + } + + public Set getListOfConvertedJoins() { + return listOfConvertedJoins; + } + + public void setListOfRejectedJoins(Set listOfRejectedJoins) { + this.listOfRejectedJoins = listOfRejectedJoins; + } + + public void setListOfConvertedJoins(Set listOfConvertedJoins) { + this.listOfConvertedJoins = listOfConvertedJoins; + } + + public Map> getAliasToPartitionBucketNumberMapping() { + return aliasToPartitionBucketNumberMapping; + } + + public Map>> getAliasToPartitionBucketFileNamesMapping() { + return aliasToPartitionBucketFileNamesMapping; + } + + public Map> getBigTblPartsToBucketFileNames() { + return bigTblPartsToBucketFileNames; + } + + public Map getBigTblPartsToBucketNumber() { + return bigTblPartsToBucketNumber; + } + + public void setAliasToPartitionBucketNumberMapping( + Map> aliasToPartitionBucketNumberMapping) { + this.aliasToPartitionBucketNumberMapping = aliasToPartitionBucketNumberMapping; + } + + public void setAliasToPartitionBucketFileNamesMapping( + Map>> aliasToPartitionBucketFileNamesMapping) { + this.aliasToPartitionBucketFileNamesMapping = aliasToPartitionBucketFileNamesMapping; + } + + public void setBigTblPartsToBucketFileNames( + Map> bigTblPartsToBucketFileNames) { + this.bigTblPartsToBucketFileNames = bigTblPartsToBucketFileNames; + } + + public void setBigTblPartsToBucketNumber(Map bigTblPartsToBucketNumber) { + this.bigTblPartsToBucketNumber = bigTblPartsToBucketNumber; + } + + public void setJoinAliases(List joinAliases) { + this.joinAliases = joinAliases; + } + + public void setBaseBigAlias(String baseBigAlias) { + this.baseBigAlias = baseBigAlias; + } + + public List getJoinAliases() { + return joinAliases; + } + + public String getBaseBigAlias() { + return baseBigAlias; + } + + public boolean isBigTablePartitioned() { + return bigTablePartitioned; + } + + public void setBigTablePartitioned(boolean bigTablePartitioned) { + this.bigTablePartitioned = bigTablePartitioned; + } +} Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortMergeJoinBigTableMatcher.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortMergeJoinBigTableMatcher.java (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortMergeJoinBigTableMatcher.java (working copy) @@ -0,0 +1,28 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.optimizer; + +import org.apache.hadoop.hive.ql.exec.JoinOperator; +import org.apache.hadoop.hive.ql.parse.ParseContext; +import org.apache.hadoop.hive.ql.parse.SemanticException; + +public interface SortMergeJoinBigTableMatcher { + public int getBigTablePosition(ParseContext parseContext, JoinOperator joinOp) + throws SemanticException; +} Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/BucketMapJoinOptimizer.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/BucketMapJoinOptimizer.java (revision 1406918) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/BucketMapJoinOptimizer.java (working copy) @@ -81,8 +81,8 @@ public ParseContext transform(ParseContext pctx) throws SemanticException { Map opRules = new LinkedHashMap(); - BucketMapjoinOptProcCtx bucketMapJoinOptimizeCtx = - new BucketMapjoinOptProcCtx(pctx.getConf()); + BucketJoinOptProcCtx bucketMapJoinOptimizeCtx = + new BucketJoinOptProcCtx(pctx.getConf()); // process map joins with no reducers pattern opRules.put(new RuleRegExp("R1", @@ -116,11 +116,11 @@ return new NodeProcessor () { @Override public Object process(Node nd, Stack stack, - NodeProcessorCtx procCtx, Object... nodeOutputs) + NodeProcessorCtx procCtx, Object... nodeOutputs) throws SemanticException { MapJoinOperator mapJoinOp = (MapJoinOperator) nd; - BucketMapjoinOptProcCtx context = (BucketMapjoinOptProcCtx) procCtx; - context.listOfRejectedMapjoins.add(mapJoinOp); + BucketJoinOptProcCtx context = (BucketJoinOptProcCtx) procCtx; + context.getListOfRejectedMapjoins().add(mapJoinOp); return null; } }; @@ -130,6 +130,10 @@ return new BucketMapjoinOptProc(pctx); } + private NodeProcessor getBucketjoinProc(ParseContext pctx) { + return new BucketMapjoinOptProc(pctx); + } + private NodeProcessor getDefaultProc() { return new NodeProcessor() { @Override @@ -141,228 +145,22 @@ }; } - class BucketMapjoinOptProc implements NodeProcessor { + class BucketMapjoinOptProc extends AbstractBucketJoinProc implements NodeProcessor { - protected ParseContext pGraphContext; - public BucketMapjoinOptProc(ParseContext pGraphContext) { - super(); - this.pGraphContext = pGraphContext; + super(pGraphContext); } - private boolean convertBucketMapJoin(Node nd, Stack stack, NodeProcessorCtx procCtx, - Object... 
nodeOutputs) throws SemanticException { - MapJoinOperator mapJoinOp = (MapJoinOperator) nd; - BucketMapjoinOptProcCtx context = (BucketMapjoinOptProcCtx) procCtx; - HiveConf conf = context.getConf(); - - if(context.getListOfRejectedMapjoins().contains(mapJoinOp)) { - return false; - } - - QBJoinTree joinCxt = this.pGraphContext.getMapJoinContext().get(mapJoinOp); - if(joinCxt == null) { - return false; - } - - List joinAliases = new ArrayList(); - String[] srcs = joinCxt.getBaseSrc(); - String[] left = joinCxt.getLeftAliases(); - List mapAlias = joinCxt.getMapAliases(); - String baseBigAlias = null; - for(String s : left) { - if(s != null && !joinAliases.contains(s)) { - joinAliases.add(s); - if(!mapAlias.contains(s)) { - baseBigAlias = s; - } - } - } - for(String s : srcs) { - if(s != null && !joinAliases.contains(s)) { - joinAliases.add(s); - if(!mapAlias.contains(s)) { - baseBigAlias = s; - } - } - } - - MapJoinDesc mjDesc = mapJoinOp.getConf(); - LinkedHashMap> aliasToPartitionBucketNumberMapping = - new LinkedHashMap>(); - LinkedHashMap>> aliasToPartitionBucketFileNamesMapping = - new LinkedHashMap>>(); - - Map> topOps = - this.pGraphContext.getTopOps(); - Map topToTable = this.pGraphContext.getTopToTable(); - - // (partition to bucket file names) and (partition to bucket number) for - // the big table; - LinkedHashMap> bigTblPartsToBucketFileNames = new LinkedHashMap>(); - LinkedHashMap bigTblPartsToBucketNumber = new LinkedHashMap(); - - Integer[] orders = null; // accessing order of join cols to bucket cols, should be same - boolean bigTablePartitioned = true; - for (int index = 0; index < joinAliases.size(); index++) { - String alias = joinAliases.get(index); - TableScanOperator tso = (TableScanOperator) topOps.get(alias); - if (tso == null) { - return false; - } - List keys = toColumns(mjDesc.getKeys().get((byte) index)); - if (keys == null || keys.isEmpty()) { - return false; - } - if (orders == null) { - orders = new Integer[keys.size()]; - } - - Table tbl = topToTable.get(tso); - if(tbl.isPartitioned()) { - PrunedPartitionList prunedParts; - try { - prunedParts = pGraphContext.getOpToPartList().get(tso); - if (prunedParts == null) { - prunedParts = PartitionPruner.prune(tbl, pGraphContext.getOpToPartPruner().get(tso), pGraphContext.getConf(), alias, - pGraphContext.getPrunedPartitions()); - pGraphContext.getOpToPartList().put(tso, prunedParts); - } - } catch (HiveException e) { - // Has to use full name to make sure it does not conflict with - // org.apache.commons.lang.StringUtils - LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e)); - throw new SemanticException(e.getMessage(), e); - } - List partitions = prunedParts.getNotDeniedPartns(); - // construct a mapping of (Partition->bucket file names) and (Partition -> bucket number) - if (partitions.isEmpty()) { - if (!alias.equals(baseBigAlias)) { - aliasToPartitionBucketNumberMapping.put(alias, Arrays.asList()); - aliasToPartitionBucketFileNamesMapping.put(alias, new ArrayList>()); - } - } else { - List buckets = new ArrayList(); - List> files = new ArrayList>(); - for (Partition p : partitions) { - if (!checkBucketColumns(p.getBucketCols(), keys, orders)) { - return false; - } - List fileNames = getOnePartitionBucketFileNames(p.getDataLocation()); - // The number of files for the table should be same as number of buckets. 
- int bucketCount = p.getBucketCount(); - if (fileNames.size() != bucketCount) { - String msg = "The number of buckets for table " + - tbl.getTableName() + " partition " + p.getName() + " is " + - p.getBucketCount() + ", whereas the number of files is " + fileNames.size(); - throw new SemanticException( - ErrorMsg.BUCKETED_TABLE_METADATA_INCORRECT.getMsg(msg)); - } - if (alias.equals(baseBigAlias)) { - bigTblPartsToBucketFileNames.put(p, fileNames); - bigTblPartsToBucketNumber.put(p, bucketCount); - } else { - files.add(fileNames); - buckets.add(bucketCount); - } - } - if (!alias.equals(baseBigAlias)) { - aliasToPartitionBucketNumberMapping.put(alias, buckets); - aliasToPartitionBucketFileNamesMapping.put(alias, files); - } - } - } else { - if (!checkBucketColumns(tbl.getBucketCols(), keys, orders)) { - return false; - } - List fileNames = getOnePartitionBucketFileNames(tbl.getDataLocation()); - Integer num = new Integer(tbl.getNumBuckets()); - // The number of files for the table should be same as number of buckets. - if (fileNames.size() != num) { - String msg = "The number of buckets for table " + - tbl.getTableName() + " is " + tbl.getNumBuckets() + - ", whereas the number of files is " + fileNames.size(); - throw new SemanticException( - ErrorMsg.BUCKETED_TABLE_METADATA_INCORRECT.getMsg(msg)); - } - if (alias.equals(baseBigAlias)) { - bigTblPartsToBucketFileNames.put(null, fileNames); - bigTblPartsToBucketNumber.put(null, tbl.getNumBuckets()); - bigTablePartitioned = false; - } else { - aliasToPartitionBucketNumberMapping.put(alias, Arrays.asList(num)); - aliasToPartitionBucketFileNamesMapping.put(alias, Arrays.asList(fileNames)); - } - } - } - - // All tables or partitions are bucketed, and their bucket number is - // stored in 'bucketNumbers', we need to check if the number of buckets in - // the big table can be divided by no of buckets in small tables. - for (Integer bucketNumber : bigTblPartsToBucketNumber.values()) { - if (!checkBucketNumberAgainstBigTable(aliasToPartitionBucketNumberMapping, bucketNumber)) { - return false; - } - } - - MapJoinDesc desc = mapJoinOp.getConf(); - - Map>> aliasBucketFileNameMapping = - new LinkedHashMap>>(); - - //sort bucket names for the big table - for(List partBucketNames : bigTblPartsToBucketFileNames.values()) { - Collections.sort(partBucketNames); - } - - // go through all small tables and get the mapping from bucket file name - // in the big table to bucket file names in small tables. - for (int j = 0; j < joinAliases.size(); j++) { - String alias = joinAliases.get(j); - if (alias.equals(baseBigAlias)) { - continue; - } - for (List names : aliasToPartitionBucketFileNamesMapping.get(alias)) { - Collections.sort(names); - } - List smallTblBucketNums = aliasToPartitionBucketNumberMapping.get(alias); - List> smallTblFilesList = aliasToPartitionBucketFileNamesMapping.get(alias); - - Map> mapping = new LinkedHashMap>(); - aliasBucketFileNameMapping.put(alias, mapping); - - // for each bucket file in big table, get the corresponding bucket file - // name in the small table. 
- //more than 1 partition in the big table, do the mapping for each partition - Iterator>> bigTblPartToBucketNames = - bigTblPartsToBucketFileNames.entrySet().iterator(); - Iterator> bigTblPartToBucketNum = bigTblPartsToBucketNumber - .entrySet().iterator(); - while (bigTblPartToBucketNames.hasNext()) { - assert bigTblPartToBucketNum.hasNext(); - int bigTblBucketNum = bigTblPartToBucketNum.next().getValue(); - List bigTblBucketNameList = bigTblPartToBucketNames.next().getValue(); - fillMapping(smallTblBucketNums, smallTblFilesList, - mapping, bigTblBucketNum, bigTblBucketNameList, desc.getBigTableBucketNumMapping()); - } - } - desc.setAliasBucketFileNameMapping(aliasBucketFileNameMapping); - desc.setBigTableAlias(baseBigAlias); - if (bigTablePartitioned) { - desc.setBigTablePartSpecToFileMapping(convert(bigTblPartsToBucketFileNames)); - } - - return true; - } - - @Override @SuppressWarnings("unchecked") public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, Object... nodeOutputs) throws SemanticException { + BucketJoinOptProcCtx context = (BucketJoinOptProcCtx) procCtx; + MapJoinOperator mapJoinOperator = (MapJoinOperator)nd; - boolean convert = convertBucketMapJoin(nd, stack, procCtx, nodeOutputs); - BucketMapjoinOptProcCtx context = (BucketMapjoinOptProcCtx) procCtx; + // can the mapjoin present be converted to a bucketed mapjoin + boolean convert = canConvertMapJoinToBucketMapJoin( + mapJoinOperator, pGraphContext, context); HiveConf conf = context.getConf(); // Throw an error if the user asked for bucketed mapjoin to be enforced and @@ -371,130 +169,13 @@ throw new SemanticException(ErrorMsg.BUCKET_MAPJOIN_NOT_POSSIBLE.getMsg()); } - return null; - } - - private List toColumns(List keys) { - List columns = new ArrayList(); - for (ExprNodeDesc key : keys) { - if (!(key instanceof ExprNodeColumnDesc)) { - return null; - } - columns.add(((ExprNodeColumnDesc) key).getColumn()); + if (convert) { + // convert the mapjoin to a bucketized mapjoin + convertMapJoinToBucketMapJoin(mapJoinOperator, context); } - return columns; - } - // convert partition to partition spec string - private Map> convert(Map> mapping) { - Map> converted = new HashMap>(); - for (Map.Entry> entry : mapping.entrySet()) { - converted.put(entry.getKey().getName(), entry.getValue()); - } - return converted; + return null; } - - // called for each partition of big table and populates mapping for each file in the partition - private void fillMapping( - List smallTblBucketNums, - List> smallTblFilesList, - Map> mapping, - int bigTblBucketNum, List bigTblBucketNameList, - Map bucketFileNameMapping) { - - for (int bindex = 0; bindex < bigTblBucketNameList.size(); bindex++) { - ArrayList resultFileNames = new ArrayList(); - for (int sindex = 0 ; sindex < smallTblBucketNums.size(); sindex++) { - int smallTblBucketNum = smallTblBucketNums.get(sindex); - List smallTblFileNames = smallTblFilesList.get(sindex); - if (bigTblBucketNum >= smallTblBucketNum) { - // if the big table has more buckets than the current small table, - // use "MOD" to get small table bucket names. For example, if the big - // table has 4 buckets and the small table has 2 buckets, then the - // mapping should be 0->0, 1->1, 2->0, 3->1. 
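For illustration only (the helper class below is hypothetical and not part of this patch), the mapping rule described in the comment just above can be sketched as a small standalone Java program: when the big table has at least as many buckets, each big-table bucket wraps around with MOD; when the small table has more buckets, each big-table bucket reads every (smallBuckets / bigBuckets)-th small-table bucket, exactly as fillMapping does.

import java.util.ArrayList;
import java.util.List;

// Illustration only; not part of the Hive patch. Mirrors the bucket mapping rule of fillMapping.
public class BucketMappingSketch {

  // Returns the small-table bucket indexes read for a given big-table bucket.
  static List<Integer> smallBucketsFor(int bigIndex, int bigBuckets, int smallBuckets) {
    List<Integer> result = new ArrayList<Integer>();
    if (bigBuckets >= smallBuckets) {
      // Big table has at least as many buckets: wrap around with MOD.
      result.add(bigIndex % smallBuckets);
    } else {
      // Small table has more buckets: take every (smallBuckets / bigBuckets)-th bucket.
      int stride = smallBuckets / bigBuckets;
      for (int i = bigIndex; i < smallBuckets; i += stride) {
        result.add(i);
      }
    }
    return result;
  }

  public static void main(String[] args) {
    // Big table with 4 buckets, small table with 2: prints 0 -> [0], 1 -> [1], 2 -> [0], 3 -> [1].
    for (int b = 0; b < 4; b++) {
      System.out.println(b + " -> " + smallBucketsFor(b, 4, 2));
    }
    // Big table with 2 buckets, small table with 4: prints 0 -> [0, 2], 1 -> [1, 3].
    for (int b = 0; b < 2; b++) {
      System.out.println(b + " -> " + smallBucketsFor(b, 2, 4));
    }
  }
}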
- int toAddSmallIndex = bindex % smallTblBucketNum; - resultFileNames.add(smallTblFileNames.get(toAddSmallIndex)); - } else { - int jump = smallTblBucketNum / bigTblBucketNum; - for (int i = bindex; i < smallTblFileNames.size(); i = i + jump) { - resultFileNames.add(smallTblFileNames.get(i)); - } - } - } - String inputBigTBLBucket = bigTblBucketNameList.get(bindex); - mapping.put(inputBigTBLBucket, resultFileNames); - bucketFileNameMapping.put(inputBigTBLBucket, bindex); - } - } - - private boolean checkBucketNumberAgainstBigTable( - Map> aliasToBucketNumber, int bucketNumberInPart) { - for (List bucketNums : aliasToBucketNumber.values()) { - for (int nxt : bucketNums) { - boolean ok = (nxt >= bucketNumberInPart) ? nxt % bucketNumberInPart == 0 - : bucketNumberInPart % nxt == 0; - if (!ok) { - return false; - } - } - } - return true; - } - - private List getOnePartitionBucketFileNames(URI location) - throws SemanticException { - List fileNames = new ArrayList(); - try { - FileSystem fs = FileSystem.get(location, this.pGraphContext.getConf()); - FileStatus[] files = fs.listStatus(new Path(location.toString())); - if (files != null) { - for (FileStatus file : files) { - fileNames.add(file.getPath().toString()); - } - } - } catch (IOException e) { - throw new SemanticException(e); - } - return fileNames; - } - - private boolean checkBucketColumns(List bucketColumns, List keys, - Integer[] orders) { - if (keys == null || bucketColumns == null || bucketColumns.isEmpty()) { - return false; - } - for (int i = 0; i < keys.size(); i++) { - int index = bucketColumns.indexOf(keys.get(i)); - if (orders[i] != null && orders[i] != index) { - return false; - } - orders[i] = index; - } - // Check if the join columns contains all bucket columns. - // If a table is bucketized on column B, but the join key is A and B, - // it is easy to see joining on different buckets yield empty results. - return keys.containsAll(bucketColumns); - } } - class BucketMapjoinOptProcCtx implements NodeProcessorCtx { - private final HiveConf conf; - - // we only convert map joins that follows a root table scan in the same - // mapper. That means there is no reducer between the root table scan and - // mapjoin. - Set listOfRejectedMapjoins = new HashSet(); - - public BucketMapjoinOptProcCtx(HiveConf conf) { - this.conf = conf; - } - - public HiveConf getConf() { - return conf; - } - - public Set getListOfRejectedMapjoins() { - return listOfRejectedMapjoins; - } - } } Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/AbstractBucketJoinProc.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/AbstractBucketJoinProc.java (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/AbstractBucketJoinProc.java (working copy) @@ -0,0 +1,442 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements.See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership.The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License.You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.optimizer; + +import java.io.IOException; +import java.net.URI; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.Iterator; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Stack; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.ErrorMsg; +import org.apache.hadoop.hive.ql.exec.MapJoinOperator; +import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.exec.TableScanOperator; +import org.apache.hadoop.hive.ql.lib.Node; +import org.apache.hadoop.hive.ql.lib.NodeProcessor; +import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.metadata.Partition; +import org.apache.hadoop.hive.ql.metadata.Table; +import org.apache.hadoop.hive.ql.optimizer.ppr.PartitionPruner; +import org.apache.hadoop.hive.ql.parse.ParseContext; +import org.apache.hadoop.hive.ql.parse.PrunedPartitionList; +import org.apache.hadoop.hive.ql.parse.QBJoinTree; +import org.apache.hadoop.hive.ql.parse.SemanticException; +import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; +import org.apache.hadoop.hive.ql.plan.MapJoinDesc; +import org.apache.hadoop.hive.ql.plan.OperatorDesc; + +/** + *this transformation does bucket map join optimization. + */ +abstract public class AbstractBucketJoinProc implements NodeProcessor { + private static final Log LOG = + LogFactory.getLog(AbstractBucketJoinProc.class.getName()); + + protected ParseContext pGraphContext; + + public AbstractBucketJoinProc(ParseContext pGraphContext) { + this.pGraphContext = pGraphContext; + } + + public AbstractBucketJoinProc() { + } + + @Override + abstract public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, + Object... nodeOutputs) throws SemanticException; + + private static List getOnePartitionBucketFileNames( + URI location, ParseContext pGraphContext) throws SemanticException { + List fileNames = new ArrayList(); + try { + FileSystem fs = FileSystem.get(location, pGraphContext.getConf()); + FileStatus[] files = fs.listStatus(new Path(location.toString())); + if (files != null) { + for (FileStatus file : files) { + fileNames.add(file.getPath().toString()); + } + } + } catch (IOException e) { + throw new SemanticException(e); + } + return fileNames; + } + + private boolean checkBucketColumns(List bucketColumns, + List keys, + Integer[] orders) { + if (keys == null || bucketColumns == null || bucketColumns.isEmpty()) { + return false; + } + for (int i = 0; i < keys.size(); i++) { + int index = bucketColumns.indexOf(keys.get(i)); + if (orders[i] != null && orders[i] != index) { + return false; + } + orders[i] = index; + } + + // Check if the join columns contains all bucket columns. + // If a table is bucketized on column B, but the join key is A and B, + // it is easy to see joining on different buckets yield empty results. 
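+ // For example, a table bucketed on (key, value) that is joined only on key fails the + // containsAll check below, so the map join is left unconverted for that query.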
+ return keys.containsAll(bucketColumns); + } + + private boolean checkBucketNumberAgainstBigTable( + Map> aliasToBucketNumber, int bucketNumberInPart) { + for (List bucketNums : aliasToBucketNumber.values()) { + for (int nxt : bucketNums) { + boolean ok = (nxt >= bucketNumberInPart) ? nxt % bucketNumberInPart == 0 + : bucketNumberInPart % nxt == 0; + if (!ok) { + return false; + } + } + } + return true; + } + + protected boolean canConvertMapJoinToBucketMapJoin( + MapJoinOperator mapJoinOp, + ParseContext pGraphContext, + BucketJoinOptProcCtx context) throws SemanticException { + + if (context.getListOfRejectedMapjoins().contains(mapJoinOp)) { + return false; + } + + QBJoinTree joinCtx = this.pGraphContext.getMapJoinContext().get(mapJoinOp); + if (joinCtx == null) { + return false; + } + + List joinAliases = new ArrayList(); + String[] srcs = joinCtx.getBaseSrc(); + String[] left = joinCtx.getLeftAliases(); + List mapAlias = joinCtx.getMapAliases(); + String baseBigAlias = null; + for (String s : left) { + if (s != null && !joinAliases.contains(s)) { + joinAliases.add(s); + if (!mapAlias.contains(s)) { + baseBigAlias = s; + } + } + } + + for (String s : srcs) { + if (s != null && !joinAliases.contains(s)) { + joinAliases.add(s); + if (!mapAlias.contains(s)) { + baseBigAlias = s; + } + } + } + + Map> keysMap = mapJoinOp.getConf().getKeys(); + + return checkConvertBucketMapJoin( + pGraphContext, + context, + joinCtx, + keysMap, + baseBigAlias, + joinAliases); + } + + protected boolean checkConvertBucketMapJoin( + ParseContext pGraphContext, + BucketJoinOptProcCtx context, + QBJoinTree joinCtx, + Map> keysMap, + String baseBigAlias, + List joinAliases) throws SemanticException { + + Map> aliasToPartitionBucketNumberMapping = + new LinkedHashMap>(); + Map>> aliasToPartitionBucketFileNamesMapping = + new LinkedHashMap>>(); + + HashMap> topOps = pGraphContext.getTopOps(); + Map topToTable = pGraphContext.getTopToTable(); + + // (partition to bucket file names) and (partition to bucket number) for + // the big table; + Map> bigTblPartsToBucketFileNames = + new LinkedHashMap>(); + Map bigTblPartsToBucketNumber = + new LinkedHashMap(); + + Integer[] orders = null; // accessing order of join cols to bucket cols, should be same + boolean bigTablePartitioned = true; + for (int index = 0; index < joinAliases.size(); index++) { + String alias = joinAliases.get(index); + TableScanOperator tso = (TableScanOperator) topOps.get(alias); + if (tso == null) { + return false; + } + Table tbl = topToTable.get(tso); + + List keys = toColumns(keysMap.get((byte) index)); + if (keys == null || keys.isEmpty()) { + return false; + } + if (orders == null) { + orders = new Integer[keys.size()]; + } + + if (tbl.isPartitioned()) { + PrunedPartitionList prunedParts; + try { + prunedParts = pGraphContext.getOpToPartList().get(tso); + if (prunedParts == null) { + prunedParts = + PartitionPruner.prune(tbl, pGraphContext.getOpToPartPruner().get(tso), + pGraphContext.getConf(), alias, + pGraphContext.getPrunedPartitions()); + pGraphContext.getOpToPartList().put(tso, prunedParts); + } + } catch (HiveException e) { + // Has to use full name to make sure it does not conflict with + // org.apache.commons.lang.StringUtils + LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e)); + throw new SemanticException(e.getMessage(), e); + } + List partitions = prunedParts.getNotDeniedPartns(); + // construct a mapping of (Partition->bucket file names) and (Partition -> bucket number) + if (partitions.isEmpty()) { + if 
(!alias.equals(baseBigAlias)) { + aliasToPartitionBucketNumberMapping.put(alias, Arrays.asList()); + aliasToPartitionBucketFileNamesMapping.put(alias, new ArrayList>()); + } + } else { + List buckets = new ArrayList(); + List> files = new ArrayList>(); + for (Partition p : partitions) { + if (!checkBucketColumns(p.getBucketCols(), keys, orders)) { + return false; + } + List fileNames = + getOnePartitionBucketFileNames(p.getDataLocation(), pGraphContext); + // The number of files for the table should be the same as the number of buckets. + int bucketCount = p.getBucketCount(); + + if (fileNames.size() != bucketCount) { + String msg = "The number of buckets for table " + + tbl.getTableName() + " partition " + p.getName() + " is " + + p.getBucketCount() + ", whereas the number of files is " + fileNames.size(); + throw new SemanticException( + ErrorMsg.BUCKETED_TABLE_METADATA_INCORRECT.getMsg(msg)); + } + + if (alias.equals(baseBigAlias)) { + bigTblPartsToBucketFileNames.put(p, fileNames); + bigTblPartsToBucketNumber.put(p, bucketCount); + } else { + files.add(fileNames); + buckets.add(bucketCount); + } + } + if (!alias.equals(baseBigAlias)) { + aliasToPartitionBucketNumberMapping.put(alias, buckets); + aliasToPartitionBucketFileNamesMapping.put(alias, files); + } + } + } else { + if (!checkBucketColumns(tbl.getBucketCols(), keys, orders)) { + return false; + } + List fileNames = + getOnePartitionBucketFileNames(tbl.getDataLocation(), pGraphContext); + Integer num = new Integer(tbl.getNumBuckets()); + + // The number of files for the table should be the same as the number of buckets. + if (fileNames.size() != num) { + String msg = "The number of buckets for table " + + tbl.getTableName() + " is " + tbl.getNumBuckets() + + ", whereas the number of files is " + fileNames.size(); + throw new SemanticException( + ErrorMsg.BUCKETED_TABLE_METADATA_INCORRECT.getMsg(msg)); + } + + if (alias.equals(baseBigAlias)) { + bigTblPartsToBucketFileNames.put(null, fileNames); + bigTblPartsToBucketNumber.put(null, tbl.getNumBuckets()); + bigTablePartitioned = false; + } else { + aliasToPartitionBucketNumberMapping.put(alias, Arrays.asList(num)); + aliasToPartitionBucketFileNamesMapping.put(alias, Arrays.asList(fileNames)); + } + } + } + + // All tables or partitions are bucketed and their bucket numbers have been collected + // above; now check that the number of buckets for the big table and the number of + // buckets for each small table divide one another evenly (one is a multiple of the other). 
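+ // For example, a big table with 4 buckets can be joined with small tables of 2, 4 or 8 + // buckets, but a small table with 3 buckets is rejected, since 3 and 4 do not divide + // each other evenly.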
+ for (Integer bucketNumber : bigTblPartsToBucketNumber.values()) { + if (!checkBucketNumberAgainstBigTable(aliasToPartitionBucketNumberMapping, bucketNumber)) { + return false; + } + } + + context.setAliasToPartitionBucketNumberMapping(aliasToPartitionBucketNumberMapping); + context.setAliasToPartitionBucketFileNamesMapping(aliasToPartitionBucketFileNamesMapping); + context.setBigTblPartsToBucketFileNames(bigTblPartsToBucketFileNames); + context.setBigTblPartsToBucketNumber(bigTblPartsToBucketNumber); + context.setJoinAliases(joinAliases); + context.setBaseBigAlias(baseBigAlias); + context.setBigTablePartitioned(bigTablePartitioned); + + return true; + } + + protected void convertMapJoinToBucketMapJoin( + MapJoinOperator mapJoinOp, + BucketJoinOptProcCtx context) throws SemanticException { + MapJoinDesc desc = mapJoinOp.getConf(); + + Map>> aliasBucketFileNameMapping = + new LinkedHashMap>>(); + + Map> aliasToPartitionBucketNumberMapping = + context.getAliasToPartitionBucketNumberMapping(); + + Map>> aliasToPartitionBucketFileNamesMapping = + context.getAliasToPartitionBucketFileNamesMapping(); + + Map> bigTblPartsToBucketFileNames = + context.getBigTblPartsToBucketFileNames(); + + Map bigTblPartsToBucketNumber = + context.getBigTblPartsToBucketNumber(); + + List joinAliases = context.getJoinAliases(); + String baseBigAlias = context.getBaseBigAlias(); + + // sort bucket names for the big table + for (List partBucketNames : bigTblPartsToBucketFileNames.values()) { + Collections.sort(partBucketNames); + } + + // go through all small tables and get the mapping from bucket file name + // in the big table to bucket file names in small tables. + for (int j = 0; j < joinAliases.size(); j++) { + String alias = joinAliases.get(j); + if (alias.equals(baseBigAlias)) { + continue; + } + for (List names : aliasToPartitionBucketFileNamesMapping.get(alias)) { + Collections.sort(names); + } + List smallTblBucketNums = aliasToPartitionBucketNumberMapping.get(alias); + List> smallTblFilesList = aliasToPartitionBucketFileNamesMapping.get(alias); + + Map> mapping = new LinkedHashMap>(); + aliasBucketFileNameMapping.put(alias, mapping); + + // for each bucket file in big table, get the corresponding bucket file + // name in the small table. 
+ // more than 1 partition in the big table, do the mapping for each partition + Iterator>> bigTblPartToBucketNames = + bigTblPartsToBucketFileNames.entrySet().iterator(); + Iterator> bigTblPartToBucketNum = bigTblPartsToBucketNumber + .entrySet().iterator(); + while (bigTblPartToBucketNames.hasNext()) { + assert bigTblPartToBucketNum.hasNext(); + int bigTblBucketNum = bigTblPartToBucketNum.next().getValue(); + List bigTblBucketNameList = bigTblPartToBucketNames.next().getValue(); + fillMapping(smallTblBucketNums, smallTblFilesList, + mapping, bigTblBucketNum, bigTblBucketNameList, desc.getBigTableBucketNumMapping()); + } + } + desc.setAliasBucketFileNameMapping(aliasBucketFileNameMapping); + desc.setBigTableAlias(baseBigAlias); + boolean bigTablePartitioned = context.isBigTablePartitioned(); + if (bigTablePartitioned) { + desc.setBigTablePartSpecToFileMapping(convert(bigTblPartsToBucketFileNames)); + } + } + + // convert partition to partition spec string + private static Map> convert(Map> mapping) { + Map> converted = new HashMap>(); + for (Map.Entry> entry : mapping.entrySet()) { + converted.put(entry.getKey().getName(), entry.getValue()); + } + return converted; + } + + private List toColumns(List keys) { + List columns = new ArrayList(); + for (ExprNodeDesc key : keys) { + if (!(key instanceof ExprNodeColumnDesc)) { + return null; + } + columns.add(((ExprNodeColumnDesc) key).getColumn()); + } + return columns; + } + + // called for each partition of big table and populates mapping for each file in the partition + private static void fillMapping( + List smallTblBucketNums, + List> smallTblFilesList, + Map> mapping, + int bigTblBucketNum, List bigTblBucketNameList, + Map bucketFileNameMapping) { + + for (int bindex = 0; bindex < bigTblBucketNameList.size(); bindex++) { + ArrayList resultFileNames = new ArrayList(); + for (int sindex = 0 ; sindex < smallTblBucketNums.size(); sindex++) { + int smallTblBucketNum = smallTblBucketNums.get(sindex); + List smallTblFileNames = smallTblFilesList.get(sindex); + if (bigTblBucketNum >= smallTblBucketNum) { + // if the big table has more buckets than the current small table, + // use "MOD" to get small table bucket names. For example, if the big + // table has 4 buckets and the small table has 2 buckets, then the + // mapping should be 0->0, 1->1, 2->0, 3->1. + int toAddSmallIndex = bindex % smallTblBucketNum; + resultFileNames.add(smallTblFileNames.get(toAddSmallIndex)); + } else { + int jump = smallTblBucketNum / bigTblBucketNum; + for (int i = bindex; i < smallTblFileNames.size(); i = i + jump) { + resultFileNames.add(smallTblFileNames.get(i)); + } + } + } + String inputBigTBLBucket = bigTblBucketNameList.get(bindex); + mapping.put(inputBigTBLBucket, resultFileNames); + bucketFileNameMapping.put(inputBigTBLBucket, bindex); + } + } +} Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/SizeSortMergeJoinBigTableMatcher.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/SizeSortMergeJoinBigTableMatcher.java (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/SizeSortMergeJoinBigTableMatcher.java (working copy) @@ -0,0 +1,132 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.optimizer; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.exec.JoinOperator; +import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.exec.TableScanOperator; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.metadata.Partition; +import org.apache.hadoop.hive.ql.metadata.Table; +import org.apache.hadoop.hive.ql.optimizer.ppr.PartitionPruner; +import org.apache.hadoop.hive.ql.parse.ParseContext; +import org.apache.hadoop.hive.ql.parse.PrunedPartitionList; +import org.apache.hadoop.hive.ql.parse.SemanticException; +import org.apache.hadoop.hive.ql.plan.OperatorDesc; + +/* + * This is a pluggable policy to choose the big table when converting a join to a + * sort-merge join. The table with the largest total size is chosen as the big table; the size is + * read from the "totalSize" property in the metastore when available, and computed from the + * file system otherwise. + */ +public class SizeSortMergeJoinBigTableMatcher implements SortMergeJoinBigTableMatcher { + private void getListTopOps( + Operator op, List topOps) { + if ((op.getParentOperators() == null) || + (op.getParentOperators().isEmpty())) { + return; + } + + for (Operator parentOp : op.getParentOperators()) { + if (parentOp instanceof TableScanOperator) { + topOps.add((TableScanOperator)parentOp); + } + else { + getListTopOps(parentOp, topOps); + } + } + } + + private long getSize(HiveConf conf, String size, Path path) { + // If the size is present in the metastore, use it + if (size != null) { + try { + return Long.valueOf(size); + } catch (NumberFormatException e) { + return 0; + } + } + + try { + FileSystem fs = path.getFileSystem(conf); + return fs.getContentSummary(path).getLength(); + } catch (Exception e) { + return 0; + } + } + + private long getSize(HiveConf conf, Table table) { + Path path = table.getPath(); + String size = table.getProperty("totalSize"); + return getSize(conf, size, path); + } + + private long getSize(HiveConf conf, Partition partition) { + Path path = partition.getPartitionPath(); + String size = partition.getParameters().get("totalSize"); + + return getSize(conf, size, path); + } + + public int getBigTablePosition(ParseContext parseCtx, JoinOperator joinOp) + throws SemanticException { + int bigTablePos = 0; + long maxSize = 0; + HiveConf conf = parseCtx.getConf(); + + try { + List topOps = new ArrayList(); + getListTopOps(joinOp, topOps); + int currentPos = 0; + for (TableScanOperator topOp : topOps) { + Table table = parseCtx.getTopToTable().get(topOp); + long currentSize = 0; + + if (!table.isPartitioned()) { + currentSize = getSize(conf, table); + } + else { + // For partitioned tables, get the size of all the partitions + PrunedPartitionList partsList = + PartitionPruner.prune(parseCtx.getTopToTable().get(topOp), + parseCtx.getOpToPartPruner().get(topOp), 
parseCtx.getConf(), + null, parseCtx.getPrunedPartitions()); + for (Partition part : partsList.getNotDeniedPartns()) { + currentSize += getSize(conf, part); + } + } + + if (currentSize > maxSize) { + maxSize = currentSize; + bigTablePos = currentPos; + } + currentPos++; + } + } catch (HiveException e) { + throw new SemanticException(e.getMessage()); + } + + return bigTablePos; + } +} Index: ql/src/java/org/apache/hadoop/hive/ql/exec/Task.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/Task.java (revision 1406918) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/Task.java (working copy) @@ -76,6 +76,7 @@ public static final int CONVERTED_LOCAL_MAPJOIN = 3; public static final int BACKUP_COMMON_JOIN = 4; public static final int LOCAL_MAPJOIN=5; + public static final int CONVERTED_SORTMERGEJOIN = 6; // Descendants tasks who subscribe feeds from this task Index: ql/src/java/org/apache/hadoop/hive/ql/exec/FilterOperator.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/FilterOperator.java (revision 1406918) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/FilterOperator.java (working copy) @@ -172,6 +172,11 @@ } @Override + public boolean supportAutomaticSortMergeJoin() { + return true; + } + + @Override public boolean supportUnionRemoveOptimization() { return true; } Index: ql/src/java/org/apache/hadoop/hive/ql/exec/SMBMapJoinOperator.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/SMBMapJoinOperator.java (revision 1406918) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/SMBMapJoinOperator.java (working copy) @@ -76,6 +76,8 @@ private transient boolean inputFileChanged = false; transient boolean localWorkInited = false; + private transient boolean convertedJoin = false; + public SMBMapJoinOperator() { } @@ -607,6 +609,14 @@ return OperatorType.MAPJOIN; } + public boolean isConvertedJoin() { + return convertedJoin; + } + + public void setConvertedJoin(boolean convertedJoin) { + this.convertedJoin = convertedJoin; + } + // returns rows from possibly multiple bucket files of small table in ascending order // by utilizing primary queue (borrowed from hadoop) // elements of queue (Integer) are index to FetchOperator[] (segments) @@ -682,6 +692,7 @@ } } + @Override protected boolean lessThan(Object a, Object b) { return compareKeys(keys[(Integer) a].getFirst(), keys[(Integer)b].getFirst()) < 0; } Index: ql/src/java/org/apache/hadoop/hive/ql/exec/SelectOperator.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/SelectOperator.java (revision 1406918) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/SelectOperator.java (working copy) @@ -112,6 +112,11 @@ } @Override + public boolean supportAutomaticSortMergeJoin() { + return true; + } + + @Override public boolean supportUnionRemoveOptimization() { return true; } Index: ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java (revision 1406918) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java (working copy) @@ -283,4 +283,9 @@ public boolean supportSkewJoinOptimization() { return true; } + + @Override + public boolean supportAutomaticSortMergeJoin() { + return true; + } } Index: 
ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java (revision 1406918) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java (working copy) @@ -1426,6 +1426,15 @@ this.useBucketizedHiveInputFormat = useBucketizedHiveInputFormat; } + /** + * Whether this operator supports automatic sort merge join. + * The stack is traversed, and this method is invoked for all the operators. + * @return TRUE if yes, FALSE otherwise. + */ + public boolean supportAutomaticSortMergeJoin() { + return false; + } + public boolean supportUnionRemoveOptimization() { return false; } Index: ql/src/java/org/apache/hadoop/hive/ql/plan/SMBJoinDesc.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/plan/SMBJoinDesc.java (revision 1406918) +++ ql/src/java/org/apache/hadoop/hive/ql/plan/SMBJoinDesc.java (working copy) @@ -25,10 +25,10 @@ public class SMBJoinDesc extends MapJoinDesc implements Serializable { private static final long serialVersionUID = 1L; - + private MapredLocalWork localWork; - - //keep a mapping from tag to the fetch operator alias + + // keep a mapping from tag to the fetch operator alias private HashMap tagToAlias; public SMBJoinDesc(MapJoinDesc conf) { @@ -53,5 +53,4 @@ public void setTagToAlias(HashMap tagToAlias) { this.tagToAlias = tagToAlias; } - } Index: ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java (revision 1406918) +++ ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java (working copy) @@ -71,6 +71,7 @@ import org.apache.hadoop.hive.ql.exec.RecordWriter; import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator; import org.apache.hadoop.hive.ql.exec.RowSchema; +import org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator; import org.apache.hadoop.hive.ql.exec.SelectOperator; import org.apache.hadoop.hive.ql.exec.StatsTask; import org.apache.hadoop.hive.ql.exec.TableScanOperator; @@ -193,6 +194,7 @@ private List loadTableWork; private List loadFileWork; private Map joinContext; + private Map smbMapJoinContext; private final HashMap topToTable; private QB qb; private ASTNode ast; @@ -242,6 +244,7 @@ loadFileWork = new ArrayList(); opParseCtx = new LinkedHashMap, OpParseContext>(); joinContext = new HashMap(); + smbMapJoinContext = new HashMap(); topToTable = new HashMap(); destTableId = 1; uCtx = null; @@ -270,6 +273,7 @@ ast = null; uCtx = null; joinContext.clear(); + smbMapJoinContext.clear(); opParseCtx.clear(); groupOpToInputTables.clear(); prunedPartitions.clear(); @@ -285,6 +289,7 @@ loadTableWork = pctx.getLoadTableWork(); loadFileWork = pctx.getLoadFileWork(); joinContext = pctx.getJoinContext(); + smbMapJoinContext = pctx.getSmbMapJoinContext(); ctx = pctx.getContext(); destTableId = pctx.getDestTableId(); idToTableNameMap = pctx.getIdToTableNameMap(); @@ -299,7 +304,7 @@ public ParseContext getParseContext() { return new ParseContext(conf, qb, ast, opToPartPruner, opToPartList, topOps, - topSelOps, opParseCtx, joinContext, topToTable, loadTableWork, + topSelOps, opParseCtx, joinContext, smbMapJoinContext, topToTable, loadTableWork, loadFileWork, ctx, idToTableNameMap, destTableId, uCtx, listMapJoinOpsNoReducer, groupOpToInputTables, prunedPartitions, opToSamplePruner, globalLimitCtx, nameToSplitSample, inputs, rootTasks, 
@@ -8083,7 +8088,8 @@ } ParseContext pCtx = new ParseContext(conf, qb, child, opToPartPruner, - opToPartList, topOps, topSelOps, opParseCtx, joinContext, topToTable, + opToPartList, topOps, topSelOps, opParseCtx, joinContext, smbMapJoinContext, + topToTable, loadTableWork, loadFileWork, ctx, idToTableNameMap, destTableId, uCtx, listMapJoinOpsNoReducer, groupOpToInputTables, prunedPartitions, opToSamplePruner, globalLimitCtx, nameToSplitSample, inputs, rootTasks, Index: ql/src/java/org/apache/hadoop/hive/ql/parse/ParseContext.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/parse/ParseContext.java (revision 1406918) +++ ql/src/java/org/apache/hadoop/hive/ql/parse/ParseContext.java (working copy) @@ -34,6 +34,7 @@ import org.apache.hadoop.hive.ql.exec.JoinOperator; import org.apache.hadoop.hive.ql.exec.MapJoinOperator; import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator; import org.apache.hadoop.hive.ql.exec.TableScanOperator; import org.apache.hadoop.hive.ql.exec.Task; import org.apache.hadoop.hive.ql.hooks.LineageInfo; @@ -71,6 +72,7 @@ private LinkedHashMap, OpParseContext> opParseCtx; private Map joinContext; private Map mapJoinContext; + private Map smbMapJoinContext; private HashMap topToTable; private HashMap nameToSplitSample; private List loadTableWork; @@ -159,6 +161,7 @@ HashMap> topSelOps, LinkedHashMap, OpParseContext> opParseCtx, Map joinContext, + Map smbMapJoinContext, HashMap topToTable, List loadTableWork, List loadFileWork, Context ctx, HashMap idToTableNameMap, int destTableId, @@ -176,6 +179,7 @@ this.opToPartPruner = opToPartPruner; this.opToPartList = opToPartList; this.joinContext = joinContext; + this.smbMapJoinContext = smbMapJoinContext; this.topToTable = topToTable; this.loadFileWork = loadFileWork; this.loadTableWork = loadTableWork; @@ -525,6 +529,14 @@ this.mapJoinContext = mapJoinContext; } + public Map getSmbMapJoinContext() { + return smbMapJoinContext; + } + + public void setSmbMapJoinContext(Map smbMapJoinContext) { + this.smbMapJoinContext = smbMapJoinContext; + } + public GlobalLimitCtx getGlobalLimitCtx() { return globalLimitCtx; }
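The SortMergeJoinBigTableMatcher interface introduced earlier in this patch is the extension point for big-table selection policies, with SizeSortMergeJoinBigTableMatcher as the size-based implementation. As a rough, hypothetical sketch (not part of this patch, and assuming that big-table positions follow the order of the join operator's parents), a policy that always picks the right-most table could look like this:

package org.apache.hadoop.hive.ql.optimizer;

import java.util.List;

import org.apache.hadoop.hive.ql.exec.JoinOperator;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.parse.ParseContext;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;

/*
 * Hypothetical example policy, for illustration only: always treat the right-most (last)
 * parent of the join as the big table. It shows the SortMergeJoinBigTableMatcher contract;
 * the patch itself ships the size-based policy above.
 */
public class RightSortMergeJoinBigTableMatcher implements SortMergeJoinBigTableMatcher {

  @Override
  public int getBigTablePosition(ParseContext parseContext, JoinOperator joinOp)
      throws SemanticException {
    // Assumes positions follow the order of the join operator's parents, so the last
    // parent corresponds to the right-most table in the join.
    List<Operator<? extends OperatorDesc>> parents = joinOp.getParentOperators();
    if (parents == null || parents.isEmpty()) {
      return 0;
    }
    return parents.size() - 1;
  }
}

Like SizeSortMergeJoinBigTableMatcher, such a policy only has to return the position of the table it wants treated as the big table.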