Index: data/files/srcsortbucket1outof4.txt =================================================================== --- data/files/srcsortbucket1outof4.txt (revision 0) +++ data/files/srcsortbucket1outof4.txt (working copy) @@ -0,0 +1,118 @@ +0val_0 +0val_0 +0val_0 +103val_103 +103val_103 +11val_11 +114val_114 +118val_118 +118val_118 +125val_125 +125val_125 +129val_129 +129val_129 +136val_136 +143val_143 +15val_15 +15val_15 +150val_150 +158val_158 +165val_165 +165val_165 +169val_169 +169val_169 +169val_169 +169val_169 +172val_172 +172val_172 +176val_176 +176val_176 +183val_183 +187val_187 +187val_187 +187val_187 +19val_19 +190val_190 +194val_194 +202val_202 +213val_213 +213val_213 +217val_217 +217val_217 +224val_224 +224val_224 +228val_228 +235val_235 +239val_239 +239val_239 +242val_242 +242val_242 +257val_257 +26val_26 +26val_26 +260val_260 +275val_275 +282val_282 +282val_282 +286val_286 +305val_305 +309val_309 +309val_309 +316val_316 +316val_316 +316val_316 +323val_323 +327val_327 +327val_327 +327val_327 +33val_33 +338val_338 +341val_341 +345val_345 +356val_356 +367val_367 +367val_367 +37val_37 +37val_37 +374val_374 +378val_378 +389val_389 +392val_392 +396val_396 +396val_396 +396val_396 +4val_4 +400val_400 +404val_404 +404val_404 +411val_411 +419val_419 +437val_437 +44val_44 +444val_444 +448val_448 +455val_455 +459val_459 +459val_459 +462val_462 +462val_462 +466val_466 +466val_466 +466val_466 +477val_477 +480val_480 +480val_480 +480val_480 +484val_484 +491val_491 +495val_495 +51val_51 +51val_51 +66val_66 +77val_77 +8val_8 +80val_80 +84val_84 +84val_84 +95val_95 +95val_95 Index: data/files/srcsortbucket2outof4.txt =================================================================== --- data/files/srcsortbucket2outof4.txt (revision 0) +++ data/files/srcsortbucket2outof4.txt (working copy) @@ -0,0 +1,120 @@ +100val_100 +100val_100 +104val_104 +104val_104 +111val_111 +119val_119 +119val_119 +119val_119 +12val_12 +12val_12 +126val_126 +133val_133 +137val_137 +137val_137 +155val_155 +162val_162 +166val_166 +177val_177 +180val_180 +191val_191 +191val_191 +195val_195 +195val_195 +199val_199 +199val_199 +199val_199 +203val_203 +203val_203 +207val_207 +207val_207 +214val_214 +218val_218 +221val_221 +221val_221 +229val_229 +229val_229 +247val_247 +258val_258 +265val_265 +265val_265 +27val_27 +272val_272 +272val_272 +283val_283 +287val_287 +298val_298 +298val_298 +298val_298 +30val_30 +302val_302 +306val_306 +317val_317 +317val_317 +331val_331 +331val_331 +335val_335 +339val_339 +34val_34 +342val_342 +342val_342 +353val_353 +353val_353 +360val_360 +364val_364 +368val_368 +375val_375 +379val_379 +382val_382 +382val_382 +386val_386 +393val_393 +397val_397 +397val_397 +401val_401 +401val_401 +401val_401 +401val_401 +401val_401 +409val_409 +409val_409 +409val_409 +41val_41 +427val_427 +430val_430 +430val_430 +430val_430 +438val_438 +438val_438 +438val_438 +449val_449 +452val_452 +463val_463 +463val_463 +467val_467 +470val_470 +478val_478 +478val_478 +481val_481 +485val_485 +489val_489 +489val_489 +489val_489 +489val_489 +492val_492 +492val_492 +496val_496 +5val_5 +5val_5 +5val_5 +67val_67 +67val_67 +70val_70 +70val_70 +70val_70 +74val_74 +78val_78 +85val_85 +9val_9 +92val_92 +96val_96 Index: data/files/srcsortbucket3outof4.txt =================================================================== --- data/files/srcsortbucket3outof4.txt (revision 0) +++ data/files/srcsortbucket3outof4.txt (working copy) @@ -0,0 +1,124 @@ +105val_105 +116val_116 +134val_134 +134val_134 +138val_138 +138val_138 +138val_138 
+138val_138 +145val_145 +149val_149 +149val_149 +152val_152 +152val_152 +156val_156 +163val_163 +167val_167 +167val_167 +167val_167 +17val_17 +170val_170 +174val_174 +174val_174 +178val_178 +181val_181 +189val_189 +192val_192 +196val_196 +2val_2 +20val_20 +200val_200 +200val_200 +208val_208 +208val_208 +208val_208 +219val_219 +219val_219 +222val_222 +226val_226 +233val_233 +233val_233 +237val_237 +237val_237 +24val_24 +24val_24 +244val_244 +248val_248 +255val_255 +255val_255 +262val_262 +266val_266 +273val_273 +273val_273 +273val_273 +277val_277 +277val_277 +277val_277 +277val_277 +28val_28 +280val_280 +280val_280 +284val_284 +288val_288 +288val_288 +291val_291 +307val_307 +307val_307 +310val_310 +318val_318 +318val_318 +318val_318 +321val_321 +321val_321 +325val_325 +325val_325 +332val_332 +336val_336 +35val_35 +35val_35 +35val_35 +365val_365 +369val_369 +369val_369 +369val_369 +394val_394 +402val_402 +406val_406 +406val_406 +406val_406 +406val_406 +413val_413 +413val_413 +417val_417 +417val_417 +417val_417 +42val_42 +42val_42 +424val_424 +424val_424 +431val_431 +431val_431 +431val_431 +435val_435 +439val_439 +439val_439 +446val_446 +453val_453 +457val_457 +460val_460 +468val_468 +468val_468 +468val_468 +468val_468 +475val_475 +479val_479 +482val_482 +493val_493 +497val_497 +53val_53 +57val_57 +64val_64 +82val_82 +86val_86 +97val_97 +97val_97 Index: data/files/srcsortbucket4outof4.txt =================================================================== --- data/files/srcsortbucket4outof4.txt (revision 0) +++ data/files/srcsortbucket4outof4.txt (working copy) @@ -0,0 +1,138 @@ +10val_10 +113val_113 +113val_113 +120val_120 +120val_120 +128val_128 +128val_128 +128val_128 +131val_131 +146val_146 +146val_146 +153val_153 +157val_157 +160val_160 +164val_164 +164val_164 +168val_168 +175val_175 +175val_175 +179val_179 +179val_179 +18val_18 +18val_18 +186val_186 +193val_193 +193val_193 +193val_193 +197val_197 +197val_197 +201val_201 +205val_205 +205val_205 +209val_209 +209val_209 +216val_216 +216val_216 +223val_223 +223val_223 +230val_230 +230val_230 +230val_230 +230val_230 +230val_230 +238val_238 +238val_238 +241val_241 +249val_249 +252val_252 +256val_256 +256val_256 +263val_263 +274val_274 +278val_278 +278val_278 +281val_281 +281val_281 +285val_285 +289val_289 +292val_292 +296val_296 +308val_308 +311val_311 +311val_311 +311val_311 +315val_315 +322val_322 +322val_322 +333val_333 +333val_333 +344val_344 +344val_344 +348val_348 +348val_348 +348val_348 +348val_348 +348val_348 +351val_351 +362val_362 +366val_366 +373val_373 +377val_377 +384val_384 +384val_384 +384val_384 +395val_395 +395val_395 +399val_399 +399val_399 +403val_403 +403val_403 +403val_403 +407val_407 +414val_414 +414val_414 +418val_418 +421val_421 +429val_429 +429val_429 +43val_43 +432val_432 +436val_436 +443val_443 +454val_454 +454val_454 +454val_454 +458val_458 +458val_458 +469val_469 +469val_469 +469val_469 +469val_469 +469val_469 +47val_47 +472val_472 +483val_483 +487val_487 +490val_490 +494val_494 +498val_498 +498val_498 +498val_498 +54val_54 +58val_58 +58val_58 +65val_65 +69val_69 +72val_72 +72val_72 +76val_76 +76val_76 +83val_83 +83val_83 +87val_87 +90val_90 +90val_90 +90val_90 +98val_98 +98val_98 Index: ql/src/test/results/clientpositive/bucketmapjoin5.q.out =================================================================== --- ql/src/test/results/clientpositive/bucketmapjoin5.q.out (revision 1366995) +++ ql/src/test/results/clientpositive/bucketmapjoin5.q.out (working copy) @@ -163,7 +163,7 @@ Position of Big Table: 1 Bucket 
Mapjoin Context: Alias Bucket Base File Name Mapping: - a {srcbucket20.txt=[srcbucket20.txt], srcbucket21.txt=[srcbucket21.txt], srcbucket22.txt=[srcbucket20.txt], srcbucket23.txt=[srcbucket21.txt], ds=2008-04-09/srcbucket20.txt=[srcbucket20.txt], ds=2008-04-09/srcbucket21.txt=[srcbucket21.txt], ds=2008-04-09/srcbucket22.txt=[srcbucket20.txt], ds=2008-04-09/srcbucket23.txt=[srcbucket21.txt]} + a {ds=2008-04-08/srcbucket20.txt=[srcbucket20.txt], ds=2008-04-08/srcbucket21.txt=[srcbucket21.txt], ds=2008-04-08/srcbucket22.txt=[srcbucket20.txt], ds=2008-04-08/srcbucket23.txt=[srcbucket21.txt], ds=2008-04-09/srcbucket20.txt=[srcbucket20.txt], ds=2008-04-09/srcbucket21.txt=[srcbucket21.txt], ds=2008-04-09/srcbucket22.txt=[srcbucket20.txt], ds=2008-04-09/srcbucket23.txt=[srcbucket21.txt]} Alias Bucket File Name Mapping: #### A masked pattern was here #### Alias Bucket Output File Name Mapping: @@ -615,7 +615,7 @@ Position of Big Table: 1 Bucket Mapjoin Context: Alias Bucket Base File Name Mapping: - a {srcbucket22.txt=[srcbucket20.txt], srcbucket23.txt=[srcbucket21.txt], ds=2008-04-09/srcbucket22.txt=[srcbucket20.txt], ds=2008-04-09/srcbucket23.txt=[srcbucket21.txt]} + a {ds=2008-04-08/srcbucket22.txt=[srcbucket20.txt], ds=2008-04-08/srcbucket23.txt=[srcbucket21.txt], ds=2008-04-09/srcbucket22.txt=[srcbucket20.txt], ds=2008-04-09/srcbucket23.txt=[srcbucket21.txt]} Alias Bucket File Name Mapping: #### A masked pattern was here #### Alias Bucket Output File Name Mapping: Index: ql/src/test/results/clientpositive/bucketcontext_2.q.out =================================================================== --- ql/src/test/results/clientpositive/bucketcontext_2.q.out (revision 0) +++ ql/src/test/results/clientpositive/bucketcontext_2.q.out (working copy) @@ -0,0 +1,546 @@ +PREHOOK: query: -- small 1 part, 4 bucket & big 2 part, 2 bucket +CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- small 1 part, 4 bucket & big 2 part, 2 bucket +CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@bucket_small +PREHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small +POSTHOOK: Output: default@bucket_small@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: 
default@bucket_small@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small@ds=2008-04-08 +PREHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@bucket_big +PREHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big +POSTHOOK: Output: default@bucket_big@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big +POSTHOOK: Output: default@bucket_big@ds=2008-04-09 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big@ds=2008-04-09 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big@ds=2008-04-09 +PREHOOK: query: explain extended select /* + MAPJOIN(a) */ count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: explain extended select /* + MAPJOIN(a) */ count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME bucket_small) a) (TOK_TABREF (TOK_TABNAME bucket_big) b) (= (. (TOK_TABLE_OR_COL a) key) (. 
(TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_HINTLIST (TOK_HINT TOK_MAPJOIN (TOK_HINTARGLIST a))) (TOK_SELEXPR (TOK_FUNCTIONSTAR count))))) + +STAGE DEPENDENCIES: + Stage-4 is a root stage + Stage-1 depends on stages: Stage-4 + Stage-2 depends on stages: Stage-1 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-4 + Map Reduce Local Work + Alias -> Map Local Tables: + a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + a + TableScan + alias: a + GatherStats: false + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + Bucket Mapjoin Context: + Alias Bucket Base File Name Mapping: + a {ds=2008-04-08/srcsortbucket1outof4.txt=[ds=2008-04-08/srcsortbucket1outof4.txt, ds=2008-04-08/srcsortbucket3outof4.txt], ds=2008-04-08/srcsortbucket2outof4.txt=[ds=2008-04-08/srcsortbucket2outof4.txt, ds=2008-04-08/srcsortbucket4outof4.txt], ds=2008-04-09/srcsortbucket1outof4.txt=[ds=2008-04-08/srcsortbucket1outof4.txt, ds=2008-04-08/srcsortbucket3outof4.txt], ds=2008-04-09/srcsortbucket2outof4.txt=[ds=2008-04-08/srcsortbucket2outof4.txt, ds=2008-04-08/srcsortbucket4outof4.txt]} + Alias Bucket File Name Mapping: +#### A masked pattern was here #### + Alias Bucket Output File Name Mapping: +#### A masked pattern was here #### + + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + b + TableScan + alias: b + GatherStats: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Local Work: + Map Reduce Local Work + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + partition_columns ds + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + partition_columns ds + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +#### A masked pattern was here #### + serde: 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-09 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-09 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + partition_columns ds + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + partition_columns ds + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Select Operator + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: -mr-10002 + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0 + columns.types bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select /* + MAPJOIN(a) */ count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@bucket_big@ds=2008-04-08 +PREHOOK: Input: default@bucket_big@ds=2008-04-09 +PREHOOK: Input: default@bucket_small@ds=2008-04-08 +#### A masked pattern was here #### +POSTHOOK: query: select /* + MAPJOIN(a) */ count(*) FROM 
bucket_small a JOIN bucket_big b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@bucket_big@ds=2008-04-08 +POSTHOOK: Input: default@bucket_big@ds=2008-04-09 +POSTHOOK: Input: default@bucket_small@ds=2008-04-08 +#### A masked pattern was here #### +928 +PREHOOK: query: explain extended select /* + MAPJOIN(a) */ count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: explain extended select /* + MAPJOIN(a) */ count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME bucket_small) a) (TOK_TABREF (TOK_TABNAME bucket_big) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_HINTLIST (TOK_HINT TOK_MAPJOIN (TOK_HINTARGLIST a))) (TOK_SELEXPR (TOK_FUNCTIONSTAR count))))) + +STAGE DEPENDENCIES: + Stage-4 is a root stage + Stage-1 depends on stages: Stage-4 + Stage-2 depends on stages: Stage-1 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-4 + Map Reduce Local Work + Alias -> Map Local Tables: + a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + a + TableScan + alias: a + GatherStats: false + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + Bucket Mapjoin Context: + Alias Bucket Base File Name Mapping: + a {ds=2008-04-08/srcsortbucket1outof4.txt=[ds=2008-04-08/srcsortbucket1outof4.txt, ds=2008-04-08/srcsortbucket3outof4.txt], ds=2008-04-08/srcsortbucket2outof4.txt=[ds=2008-04-08/srcsortbucket2outof4.txt, ds=2008-04-08/srcsortbucket4outof4.txt], ds=2008-04-09/srcsortbucket1outof4.txt=[ds=2008-04-08/srcsortbucket1outof4.txt, ds=2008-04-08/srcsortbucket3outof4.txt], ds=2008-04-09/srcsortbucket2outof4.txt=[ds=2008-04-08/srcsortbucket2outof4.txt, ds=2008-04-08/srcsortbucket4outof4.txt]} + Alias Bucket File Name Mapping: +#### A masked pattern was here #### + Alias Bucket Output File Name Mapping: +#### A masked pattern was here #### + + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + b + TableScan + alias: b + GatherStats: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Local Work: + Map Reduce Local Work + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + partition_columns ds + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + partition_columns ds + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-09 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-09 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + partition_columns ds + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + partition_columns ds + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Select Operator + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: -mr-10002 + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input 
format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0 + columns.types bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-0 + Fetch Operator + limit: -1 + + Index: ql/src/test/results/clientpositive/bucketmapjoin2.q.out =================================================================== --- ql/src/test/results/clientpositive/bucketmapjoin2.q.out (revision 1366995) +++ ql/src/test/results/clientpositive/bucketmapjoin2.q.out (working copy) @@ -108,7 +108,7 @@ Position of Big Table: 0 Bucket Mapjoin Context: Alias Bucket Base File Name Mapping: - b {srcbucket20.txt=[srcbucket22.txt], srcbucket21.txt=[srcbucket23.txt], srcbucket22.txt=[srcbucket22.txt], srcbucket23.txt=[srcbucket23.txt]} + b {ds=2008-04-08/srcbucket20.txt=[ds=2008-04-08/srcbucket22.txt], ds=2008-04-08/srcbucket21.txt=[ds=2008-04-08/srcbucket23.txt], ds=2008-04-08/srcbucket22.txt=[ds=2008-04-08/srcbucket22.txt], ds=2008-04-08/srcbucket23.txt=[ds=2008-04-08/srcbucket23.txt]} Alias Bucket File Name Mapping: #### A masked pattern was here #### Alias Bucket Output File Name Mapping: @@ -518,7 +518,7 @@ Position of Big Table: 1 Bucket Mapjoin Context: Alias Bucket Base File Name Mapping: - a {srcbucket22.txt=[srcbucket20.txt, srcbucket22.txt], srcbucket23.txt=[srcbucket21.txt, srcbucket23.txt]} + a {ds=2008-04-08/srcbucket22.txt=[ds=2008-04-08/srcbucket20.txt, ds=2008-04-08/srcbucket22.txt], ds=2008-04-08/srcbucket23.txt=[ds=2008-04-08/srcbucket21.txt, ds=2008-04-08/srcbucket23.txt]} Alias Bucket File Name Mapping: #### A masked pattern was here #### Alias Bucket Output File Name Mapping: @@ -1112,7 +1112,7 @@ Position of Big Table: 0 Bucket Mapjoin Context: Alias Bucket Base File Name Mapping: - b {srcbucket20.txt=[srcbucket22.txt, srcbucket22.txt], srcbucket21.txt=[srcbucket23.txt, srcbucket23.txt], srcbucket22.txt=[srcbucket22.txt, srcbucket22.txt], srcbucket23.txt=[srcbucket23.txt, srcbucket23.txt]} + b {ds=2008-04-08/srcbucket20.txt=[ds=2008-04-08/srcbucket22.txt, ds=2008-04-09/srcbucket22.txt], ds=2008-04-08/srcbucket21.txt=[ds=2008-04-08/srcbucket23.txt, ds=2008-04-09/srcbucket23.txt], ds=2008-04-08/srcbucket22.txt=[ds=2008-04-08/srcbucket22.txt, ds=2008-04-09/srcbucket22.txt], ds=2008-04-08/srcbucket23.txt=[ds=2008-04-08/srcbucket23.txt, ds=2008-04-09/srcbucket23.txt]} Alias Bucket File Name Mapping: #### A masked pattern was here #### Alias Bucket Output File Name Mapping: Index: ql/src/test/results/clientpositive/bucketcontext_4.q.out =================================================================== --- ql/src/test/results/clientpositive/bucketcontext_4.q.out (revision 0) +++ ql/src/test/results/clientpositive/bucketcontext_4.q.out (working copy) @@ -0,0 +1,478 @@ +PREHOOK: query: -- small 2 part, 4 bucket & big 1 part, 2 bucket +CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- small 2 part, 4 bucket & big 1 part, 2 bucket +CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@bucket_small +PREHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: 
Output: default@bucket_small +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small +POSTHOOK: Output: default@bucket_small@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small +POSTHOOK: Output: default@bucket_small@ds=2008-04-09 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small@ds=2008-04-09 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small@ds=2008-04-09 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small@ds=2008-04-09 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small@ds=2008-04-09 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small@ds=2008-04-09 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small@ds=2008-04-09 +PREHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED 
AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@bucket_big +PREHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big +POSTHOOK: Output: default@bucket_big@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big@ds=2008-04-08 +PREHOOK: query: explain extended select /* + MAPJOIN(a) */ count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: explain extended select /* + MAPJOIN(a) */ count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME bucket_small) a) (TOK_TABREF (TOK_TABNAME bucket_big) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_HINTLIST (TOK_HINT TOK_MAPJOIN (TOK_HINTARGLIST a))) (TOK_SELEXPR (TOK_FUNCTIONSTAR count))))) + +STAGE DEPENDENCIES: + Stage-4 is a root stage + Stage-1 depends on stages: Stage-4 + Stage-2 depends on stages: Stage-1 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-4 + Map Reduce Local Work + Alias -> Map Local Tables: + a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + a + TableScan + alias: a + GatherStats: false + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + Bucket Mapjoin Context: + Alias Bucket Base File Name Mapping: + a {ds=2008-04-08/srcsortbucket1outof4.txt=[ds=2008-04-08/srcsortbucket1outof4.txt, ds=2008-04-08/srcsortbucket3outof4.txt, ds=2008-04-09/srcsortbucket1outof4.txt, ds=2008-04-09/srcsortbucket3outof4.txt], ds=2008-04-08/srcsortbucket2outof4.txt=[ds=2008-04-08/srcsortbucket2outof4.txt, ds=2008-04-08/srcsortbucket4outof4.txt, ds=2008-04-09/srcsortbucket2outof4.txt, ds=2008-04-09/srcsortbucket4outof4.txt]} + Alias Bucket File Name Mapping: +#### A masked pattern was here #### + Alias Bucket Output File Name Mapping: +#### A masked pattern was here #### + + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + b + TableScan + alias: b + GatherStats: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Local Work: + Map Reduce Local Work + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was 
here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + partition_columns ds + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + partition_columns ds + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Select Operator + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: -mr-10002 + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0 + columns.types bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select /* + MAPJOIN(a) */ count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@bucket_big@ds=2008-04-08 +PREHOOK: Input: default@bucket_small@ds=2008-04-08 +PREHOOK: Input: default@bucket_small@ds=2008-04-09 +#### A masked pattern was here #### +POSTHOOK: query: select /* + MAPJOIN(a) */ count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@bucket_big@ds=2008-04-08 +POSTHOOK: 
Input: default@bucket_small@ds=2008-04-08 +POSTHOOK: Input: default@bucket_small@ds=2008-04-09 +#### A masked pattern was here #### +928 +PREHOOK: query: explain extended select /* + MAPJOIN(a) */ count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: explain extended select /* + MAPJOIN(a) */ count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME bucket_small) a) (TOK_TABREF (TOK_TABNAME bucket_big) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_HINTLIST (TOK_HINT TOK_MAPJOIN (TOK_HINTARGLIST a))) (TOK_SELEXPR (TOK_FUNCTIONSTAR count))))) + +STAGE DEPENDENCIES: + Stage-4 is a root stage + Stage-1 depends on stages: Stage-4 + Stage-2 depends on stages: Stage-1 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-4 + Map Reduce Local Work + Alias -> Map Local Tables: + a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + a + TableScan + alias: a + GatherStats: false + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + Bucket Mapjoin Context: + Alias Bucket Base File Name Mapping: + a {ds=2008-04-08/srcsortbucket1outof4.txt=[ds=2008-04-08/srcsortbucket1outof4.txt, ds=2008-04-08/srcsortbucket3outof4.txt, ds=2008-04-09/srcsortbucket1outof4.txt, ds=2008-04-09/srcsortbucket3outof4.txt], ds=2008-04-08/srcsortbucket2outof4.txt=[ds=2008-04-08/srcsortbucket2outof4.txt, ds=2008-04-08/srcsortbucket4outof4.txt, ds=2008-04-09/srcsortbucket2outof4.txt, ds=2008-04-09/srcsortbucket4outof4.txt]} + Alias Bucket File Name Mapping: +#### A masked pattern was here #### + Alias Bucket Output File Name Mapping: +#### A masked pattern was here #### + + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + b + TableScan + alias: b + GatherStats: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Local Work: + Map Reduce Local Work + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + partition_columns ds + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 2 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + partition_columns ds + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Select Operator + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: -mr-10002 + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0 + columns.types bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-0 + Fetch Operator + limit: -1 + + Index: ql/src/test/results/clientpositive/bucketcontext_1.q.out =================================================================== --- ql/src/test/results/clientpositive/bucketcontext_1.q.out (revision 0) +++ ql/src/test/results/clientpositive/bucketcontext_1.q.out (working copy) @@ -0,0 +1,540 @@ +PREHOOK: query: -- small 1 part, 2 bucket & big 2 part, 4 bucket +CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- small 1 part, 2 bucket & big 2 part, 4 bucket +CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@bucket_small +PREHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small +POSTHOOK: Output: default@bucket_small@ds=2008-04-08 +PREHOOK: query: 
load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small@ds=2008-04-08 +PREHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@bucket_big +PREHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big +POSTHOOK: Output: default@bucket_big@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big +POSTHOOK: Output: default@bucket_big@ds=2008-04-09 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big@ds=2008-04-09 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big@ds=2008-04-09 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big@ds=2008-04-09 +POSTHOOK: query: load data local inpath 
'../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big@ds=2008-04-09 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big@ds=2008-04-09 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big@ds=2008-04-09 +PREHOOK: query: explain extended select /* + MAPJOIN(a) */ count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: explain extended select /* + MAPJOIN(a) */ count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME bucket_small) a) (TOK_TABREF (TOK_TABNAME bucket_big) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_HINTLIST (TOK_HINT TOK_MAPJOIN (TOK_HINTARGLIST a))) (TOK_SELEXPR (TOK_FUNCTIONSTAR count))))) + +STAGE DEPENDENCIES: + Stage-4 is a root stage + Stage-1 depends on stages: Stage-4 + Stage-2 depends on stages: Stage-1 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-4 + Map Reduce Local Work + Alias -> Map Local Tables: + a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + a + TableScan + alias: a + GatherStats: false + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + Bucket Mapjoin Context: + Alias Bucket Base File Name Mapping: + a {ds=2008-04-08/srcsortbucket1outof4.txt=[ds=2008-04-08/srcsortbucket1outof4.txt], ds=2008-04-08/srcsortbucket2outof4.txt=[ds=2008-04-08/srcsortbucket2outof4.txt], ds=2008-04-08/srcsortbucket3outof4.txt=[ds=2008-04-08/srcsortbucket1outof4.txt], ds=2008-04-08/srcsortbucket4outof4.txt=[ds=2008-04-08/srcsortbucket2outof4.txt], ds=2008-04-09/srcsortbucket1outof4.txt=[ds=2008-04-08/srcsortbucket1outof4.txt], ds=2008-04-09/srcsortbucket2outof4.txt=[ds=2008-04-08/srcsortbucket2outof4.txt], ds=2008-04-09/srcsortbucket3outof4.txt=[ds=2008-04-08/srcsortbucket1outof4.txt], ds=2008-04-09/srcsortbucket4outof4.txt=[ds=2008-04-08/srcsortbucket2outof4.txt]} + Alias Bucket File Name Mapping: +#### A masked pattern was here #### + Alias Bucket Output File Name Mapping: +#### A masked pattern was here #### + + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + b + TableScan + alias: b + GatherStats: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Local Work: + Map Reduce Local Work + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: 
org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + partition_columns ds + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + partition_columns ds + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-09 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-09 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + partition_columns ds + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + partition_columns ds + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Select Operator + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: -mr-10002 + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat 
+ properties: + columns + columns.types + escape.delim \ + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0 + columns.types bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select /* + MAPJOIN(a) */ count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@bucket_big@ds=2008-04-08 +PREHOOK: Input: default@bucket_big@ds=2008-04-09 +PREHOOK: Input: default@bucket_small@ds=2008-04-08 +#### A masked pattern was here #### +POSTHOOK: query: select /* + MAPJOIN(a) */ count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@bucket_big@ds=2008-04-08 +POSTHOOK: Input: default@bucket_big@ds=2008-04-09 +POSTHOOK: Input: default@bucket_small@ds=2008-04-08 +#### A masked pattern was here #### +928 +PREHOOK: query: explain extended select /* + MAPJOIN(a) */ count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: explain extended select /* + MAPJOIN(a) */ count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME bucket_small) a) (TOK_TABREF (TOK_TABNAME bucket_big) b) (= (. (TOK_TABLE_OR_COL a) key) (. 
(TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_HINTLIST (TOK_HINT TOK_MAPJOIN (TOK_HINTARGLIST a))) (TOK_SELEXPR (TOK_FUNCTIONSTAR count))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + b + TableScan + alias: b + GatherStats: false + Sorted Merge Bucket Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + partition_columns ds + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + partition_columns ds + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-09 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-09 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + partition_columns ds + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + 
name default.bucket_big + partition_columns ds + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Select Operator + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: -mr-10002 + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0 + columns.types bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select /* + MAPJOIN(a) */ count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@bucket_big@ds=2008-04-08 +PREHOOK: Input: default@bucket_big@ds=2008-04-09 +PREHOOK: Input: default@bucket_small@ds=2008-04-08 +#### A masked pattern was here #### +POSTHOOK: query: select /* + MAPJOIN(a) */ count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@bucket_big@ds=2008-04-08 +POSTHOOK: Input: default@bucket_big@ds=2008-04-09 +POSTHOOK: Input: default@bucket_small@ds=2008-04-08 +#### A masked pattern was here #### +928 Index: ql/src/test/results/clientpositive/bucketmapjoin1.q.out =================================================================== --- ql/src/test/results/clientpositive/bucketmapjoin1.q.out (revision 1366995) +++ ql/src/test/results/clientpositive/bucketmapjoin1.q.out (working copy) @@ -377,7 +377,7 @@ Position of Big Table: 0 Bucket Mapjoin Context: Alias Bucket Base File Name Mapping: - b {srcbucket20.txt=[srcbucket20.txt, srcbucket22.txt], srcbucket21.txt=[srcbucket21.txt, srcbucket23.txt]} + b {srcbucket20.txt=[ds=2008-04-08/srcbucket20.txt, ds=2008-04-08/srcbucket22.txt], srcbucket21.txt=[ds=2008-04-08/srcbucket21.txt, ds=2008-04-08/srcbucket23.txt]} Alias Bucket File Name Mapping: #### A masked pattern was here #### Alias Bucket Output File Name Mapping: @@ -785,7 +785,7 @@ Position of Big Table: 1 Bucket Mapjoin Context: Alias Bucket Base File Name 
Mapping: - a {srcbucket20.txt=[srcbucket20.txt], srcbucket21.txt=[srcbucket21.txt], srcbucket22.txt=[srcbucket20.txt], srcbucket23.txt=[srcbucket21.txt]} + a {ds=2008-04-08/srcbucket20.txt=[srcbucket20.txt], ds=2008-04-08/srcbucket21.txt=[srcbucket21.txt], ds=2008-04-08/srcbucket22.txt=[srcbucket20.txt], ds=2008-04-08/srcbucket23.txt=[srcbucket21.txt]} Alias Bucket File Name Mapping: #### A masked pattern was here #### Alias Bucket Output File Name Mapping: Index: ql/src/test/results/clientpositive/bucketcontext_3.q.out =================================================================== --- ql/src/test/results/clientpositive/bucketcontext_3.q.out (revision 0) +++ ql/src/test/results/clientpositive/bucketcontext_3.q.out (working copy) @@ -0,0 +1,466 @@ +PREHOOK: query: -- small 2 part, 2 bucket & big 1 part, 4 bucket +CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- small 2 part, 2 bucket & big 1 part, 4 bucket +CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@bucket_small +PREHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small +POSTHOOK: Output: default@bucket_small@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small +POSTHOOK: Output: default@bucket_small@ds=2008-04-09 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_small@ds=2008-04-09 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_small@ds=2008-04-09 +PREHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@bucket_big +PREHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big 
partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big +POSTHOOK: Output: default@bucket_big@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big@ds=2008-04-08 +PREHOOK: query: load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +PREHOOK: type: LOAD +PREHOOK: Output: default@bucket_big@ds=2008-04-08 +POSTHOOK: query: load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08') +POSTHOOK: type: LOAD +POSTHOOK: Output: default@bucket_big@ds=2008-04-08 +PREHOOK: query: explain extended select /* + MAPJOIN(a) */ count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: explain extended select /* + MAPJOIN(a) */ count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME bucket_small) a) (TOK_TABREF (TOK_TABNAME bucket_big) b) (= (. (TOK_TABLE_OR_COL a) key) (. 
(TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_HINTLIST (TOK_HINT TOK_MAPJOIN (TOK_HINTARGLIST a))) (TOK_SELEXPR (TOK_FUNCTIONSTAR count))))) + +STAGE DEPENDENCIES: + Stage-4 is a root stage + Stage-1 depends on stages: Stage-4 + Stage-2 depends on stages: Stage-1 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-4 + Map Reduce Local Work + Alias -> Map Local Tables: + a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + a + TableScan + alias: a + GatherStats: false + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + Bucket Mapjoin Context: + Alias Bucket Base File Name Mapping: + a {ds=2008-04-08/srcsortbucket1outof4.txt=[ds=2008-04-08/srcsortbucket1outof4.txt, ds=2008-04-09/srcsortbucket1outof4.txt], ds=2008-04-08/srcsortbucket2outof4.txt=[ds=2008-04-08/srcsortbucket2outof4.txt, ds=2008-04-09/srcsortbucket2outof4.txt], ds=2008-04-08/srcsortbucket3outof4.txt=[ds=2008-04-08/srcsortbucket1outof4.txt, ds=2008-04-09/srcsortbucket1outof4.txt], ds=2008-04-08/srcsortbucket4outof4.txt=[ds=2008-04-08/srcsortbucket2outof4.txt, ds=2008-04-09/srcsortbucket2outof4.txt]} + Alias Bucket File Name Mapping: +#### A masked pattern was here #### + Alias Bucket Output File Name Mapping: +#### A masked pattern was here #### + + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + b + TableScan + alias: b + GatherStats: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Local Work: + Map Reduce Local Work + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + partition_columns ds + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + partition_columns ds + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +#### A masked pattern was here #### + serde: 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Select Operator + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: -mr-10002 + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0 + columns.types bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select /* + MAPJOIN(a) */ count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@bucket_big@ds=2008-04-08 +PREHOOK: Input: default@bucket_small@ds=2008-04-08 +PREHOOK: Input: default@bucket_small@ds=2008-04-09 +#### A masked pattern was here #### +POSTHOOK: query: select /* + MAPJOIN(a) */ count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@bucket_big@ds=2008-04-08 +POSTHOOK: Input: default@bucket_small@ds=2008-04-08 +POSTHOOK: Input: default@bucket_small@ds=2008-04-09 +#### A masked pattern was here #### +928 +PREHOOK: query: explain extended select /* + MAPJOIN(a) */ count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: explain extended select /* + MAPJOIN(a) */ count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME bucket_small) a) (TOK_TABREF (TOK_TABNAME bucket_big) b) (= (. (TOK_TABLE_OR_COL a) key) (. 
(TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_HINTLIST (TOK_HINT TOK_MAPJOIN (TOK_HINTARGLIST a))) (TOK_SELEXPR (TOK_FUNCTIONSTAR count))))) + +STAGE DEPENDENCIES: + Stage-4 is a root stage + Stage-1 depends on stages: Stage-4 + Stage-2 depends on stages: Stage-1 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-4 + Map Reduce Local Work + Alias -> Map Local Tables: + a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + a + TableScan + alias: a + GatherStats: false + HashTable Sink Operator + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + Bucket Mapjoin Context: + Alias Bucket Base File Name Mapping: + a {ds=2008-04-08/srcsortbucket1outof4.txt=[ds=2008-04-08/srcsortbucket1outof4.txt, ds=2008-04-09/srcsortbucket1outof4.txt], ds=2008-04-08/srcsortbucket2outof4.txt=[ds=2008-04-08/srcsortbucket2outof4.txt, ds=2008-04-09/srcsortbucket2outof4.txt], ds=2008-04-08/srcsortbucket3outof4.txt=[ds=2008-04-08/srcsortbucket1outof4.txt, ds=2008-04-09/srcsortbucket1outof4.txt], ds=2008-04-08/srcsortbucket4outof4.txt=[ds=2008-04-08/srcsortbucket2outof4.txt, ds=2008-04-09/srcsortbucket2outof4.txt]} + Alias Bucket File Name Mapping: +#### A masked pattern was here #### + Alias Bucket Output File Name Mapping: +#### A masked pattern was here #### + + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + b + TableScan + alias: b + GatherStats: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Local Work: + Map Reduce Local Work + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: ds=2008-04-08 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + partition_columns ds + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + SORTBUCKETCOLSPREFIX TRUE + bucket_count 4 + bucket_field_name key + columns key,value + columns.types string:string +#### A masked pattern was here #### + name default.bucket_big + partition_columns ds + serialization.ddl struct bucket_big { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +#### A masked pattern was here #### + serde: 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucket_big + name: default.bucket_big + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Select Operator + Select Operator + Group By Operator + aggregations: + expr: count() + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Needs Tagging: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: -mr-10002 + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns + columns.types + escape.delim \ + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0 + columns.types bigint + escape.delim \ + serialization.format 1 + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-0 + Fetch Operator + limit: -1 + + Index: ql/src/test/results/clientpositive/bucketmapjoin3.q.out =================================================================== --- ql/src/test/results/clientpositive/bucketmapjoin3.q.out (revision 1366995) +++ ql/src/test/results/clientpositive/bucketmapjoin3.q.out (working copy) @@ -125,7 +125,7 @@ Position of Big Table: 0 Bucket Mapjoin Context: Alias Bucket Base File Name Mapping: - b {srcbucket22.txt=[srcbucket20.txt, srcbucket22.txt], srcbucket23.txt=[srcbucket21.txt, srcbucket23.txt]} + b {ds=2008-04-08/srcbucket22.txt=[ds=2008-04-08/srcbucket20.txt, ds=2008-04-08/srcbucket22.txt], ds=2008-04-08/srcbucket23.txt=[ds=2008-04-08/srcbucket21.txt, ds=2008-04-08/srcbucket23.txt]} Alias Bucket File Name Mapping: #### A masked pattern was here #### Alias Bucket Output File Name Mapping: @@ -535,7 +535,7 @@ Position of Big Table: 1 Bucket Mapjoin Context: Alias Bucket Base File Name Mapping: - a {srcbucket20.txt=[srcbucket22.txt], srcbucket21.txt=[srcbucket23.txt], srcbucket22.txt=[srcbucket22.txt], srcbucket23.txt=[srcbucket23.txt]} + a {ds=2008-04-08/srcbucket20.txt=[ds=2008-04-08/srcbucket22.txt], ds=2008-04-08/srcbucket21.txt=[ds=2008-04-08/srcbucket23.txt], ds=2008-04-08/srcbucket22.txt=[ds=2008-04-08/srcbucket22.txt], ds=2008-04-08/srcbucket23.txt=[ds=2008-04-08/srcbucket23.txt]} Alias Bucket File Name Mapping: #### A masked pattern was here #### Alias Bucket Output File Name Mapping: Index: ql/src/test/results/clientpositive/bucketmapjoin_negative2.q.out =================================================================== --- ql/src/test/results/clientpositive/bucketmapjoin_negative2.q.out (revision 1366995) +++ ql/src/test/results/clientpositive/bucketmapjoin_negative2.q.out (working copy) @@ -98,7 +98,7 @@ Position of Big Table: 0 
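The explain plans above record, for each bucket file of the big table, the small-table bucket files it is joined against (the "Alias Bucket Base File Name Mapping"), and with this patch each file name is prefixed by its partition directory. The following is a simplified, hypothetical sketch of how such a mapping can be derived when the big bucket count is a multiple of the small bucket count; it is an illustration only, not the optimizer code in this patch, and the class and method names are made up.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

// Hypothetical illustration class; not part of the patch.
public class BucketMappingSketch {

  // For big bucket i, pick small bucket (i mod smallBucketCount) from every
  // small-table partition. This reproduces mappings such as
  //   ds=2008-04-08/srcsortbucket3outof4.txt ->
  //     [ds=2008-04-08/srcsortbucket1outof4.txt, ds=2008-04-09/srcsortbucket1outof4.txt]
  static Map<String, List<String>> mapBigToSmall(List<String> bigBuckets,
      List<List<String>> smallPartitions) {
    Map<String, List<String>> mapping = new LinkedHashMap<String, List<String>>();
    for (int i = 0; i < bigBuckets.size(); i++) {
      List<String> matches = new ArrayList<String>();
      for (List<String> partitionFiles : smallPartitions) {
        matches.add(partitionFiles.get(i % partitionFiles.size()));
      }
      mapping.put(bigBuckets.get(i), matches);
    }
    return mapping;
  }

  public static void main(String[] args) {
    List<String> big = Arrays.asList(
        "ds=2008-04-08/srcsortbucket1outof4.txt", "ds=2008-04-08/srcsortbucket2outof4.txt",
        "ds=2008-04-08/srcsortbucket3outof4.txt", "ds=2008-04-08/srcsortbucket4outof4.txt");
    List<List<String>> small = Arrays.asList(
        Arrays.asList("ds=2008-04-08/srcsortbucket1outof4.txt", "ds=2008-04-08/srcsortbucket2outof4.txt"),
        Arrays.asList("ds=2008-04-09/srcsortbucket1outof4.txt", "ds=2008-04-09/srcsortbucket2outof4.txt"));
    for (Map.Entry<String, List<String>> e : mapBigToSmall(big, small).entrySet()) {
      System.out.println(e.getKey() + " -> " + e.getValue());
    }
  }
}

Running this prints the same four big-to-small pairings shown in the bucketcontext_3 explain output above; the real optimizer additionally sorts the bucket file names and checks that one bucket count divides the other before building the mapping.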
Bucket Mapjoin Context: Alias Bucket Base File Name Mapping: - b {srcbucket20.txt=[srcbucket22.txt, srcbucket22.txt], srcbucket21.txt=[srcbucket23.txt, srcbucket23.txt]} + b {srcbucket20.txt=[ds=2008-04-08/srcbucket22.txt, ds=2008-04-09/srcbucket22.txt], srcbucket21.txt=[ds=2008-04-08/srcbucket23.txt, ds=2008-04-09/srcbucket23.txt]} Alias Bucket File Name Mapping: #### A masked pattern was here #### Alias Bucket Output File Name Mapping: Index: ql/src/test/results/clientpositive/stats11.q.out =================================================================== --- ql/src/test/results/clientpositive/stats11.q.out (revision 1366995) +++ ql/src/test/results/clientpositive/stats11.q.out (working copy) @@ -125,7 +125,7 @@ Position of Big Table: 0 Bucket Mapjoin Context: Alias Bucket Base File Name Mapping: - b {srcbucket20.txt=[srcbucket20.txt, srcbucket22.txt], srcbucket21.txt=[srcbucket21.txt, srcbucket23.txt]} + b {srcbucket20.txt=[ds=2008-04-08/srcbucket20.txt, ds=2008-04-08/srcbucket22.txt], srcbucket21.txt=[ds=2008-04-08/srcbucket21.txt, ds=2008-04-08/srcbucket23.txt]} Alias Bucket File Name Mapping: #### A masked pattern was here #### Alias Bucket Output File Name Mapping: @@ -533,7 +533,7 @@ Position of Big Table: 1 Bucket Mapjoin Context: Alias Bucket Base File Name Mapping: - a {srcbucket20.txt=[srcbucket20.txt], srcbucket21.txt=[srcbucket21.txt], srcbucket22.txt=[srcbucket20.txt], srcbucket23.txt=[srcbucket21.txt]} + a {ds=2008-04-08/srcbucket20.txt=[srcbucket20.txt], ds=2008-04-08/srcbucket21.txt=[srcbucket21.txt], ds=2008-04-08/srcbucket22.txt=[srcbucket20.txt], ds=2008-04-08/srcbucket23.txt=[srcbucket21.txt]} Alias Bucket File Name Mapping: #### A masked pattern was here #### Alias Bucket Output File Name Mapping: Index: ql/src/test/queries/clientpositive/bucketcontext_1.q =================================================================== --- ql/src/test/queries/clientpositive/bucketcontext_1.q (revision 0) +++ ql/src/test/queries/clientpositive/bucketcontext_1.q (working copy) @@ -0,0 +1,24 @@ +-- small 1 part, 2 bucket & big 2 part, 4 bucket +CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE; +load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08'); +load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08'); + +CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE; +load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08'); +load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08'); +load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08'); +load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08'); + +load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09'); +load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09'); +load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09'); +load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09'); + +set 
hive.optimize.bucketmapjoin = true; +explain extended select /* + MAPJOIN(a) */ count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key; +select /* + MAPJOIN(a) */ count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key; + +set hive.optimize.bucketmapjoin.sortedmerge = true; +set hive.input.format = org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat; +explain extended select /* + MAPJOIN(a) */ count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key; +select /* + MAPJOIN(a) */ count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key; Index: ql/src/test/queries/clientpositive/bucketcontext_3.q =================================================================== --- ql/src/test/queries/clientpositive/bucketcontext_3.q (revision 0) +++ ql/src/test/queries/clientpositive/bucketcontext_3.q (working copy) @@ -0,0 +1,22 @@ +-- small 2 part, 2 bucket & big 1 part, 4 bucket +CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE; +load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08'); +load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08'); + +load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09'); +load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09'); + +CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE; +load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08'); +load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08'); +load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08'); +load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08'); + +set hive.optimize.bucketmapjoin = true; +explain extended select /* + MAPJOIN(a) */ count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key; +select /* + MAPJOIN(a) */ count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key; + +set hive.optimize.bucketmapjoin.sortedmerge = true; +set hive.input.format = org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat; +explain extended select /* + MAPJOIN(a) */ count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key; + Index: ql/src/test/queries/clientpositive/bucketcontext_2.q =================================================================== --- ql/src/test/queries/clientpositive/bucketcontext_2.q (revision 0) +++ ql/src/test/queries/clientpositive/bucketcontext_2.q (working copy) @@ -0,0 +1,22 @@ +-- small 1 part, 4 bucket & big 2 part, 2 bucket +CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE; +load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08'); +load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08'); +load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08'); +load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE 
bucket_small partition(ds='2008-04-08'); + +CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE; +load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08'); +load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08'); + +load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09'); +load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-09'); + +set hive.optimize.bucketmapjoin = true; +explain extended select /* + MAPJOIN(a) */ count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key; +select /* + MAPJOIN(a) */ count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key; + +set hive.optimize.bucketmapjoin.sortedmerge = true; +set hive.input.format = org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat; +explain extended select /* + MAPJOIN(a) */ count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key; + Index: ql/src/test/queries/clientpositive/bucketcontext_4.q =================================================================== --- ql/src/test/queries/clientpositive/bucketcontext_4.q (revision 0) +++ ql/src/test/queries/clientpositive/bucketcontext_4.q (working copy) @@ -0,0 +1,25 @@ +-- small 2 part, 4 bucket & big 1 part, 2 bucket +CREATE TABLE bucket_small (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE; +load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08'); +load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08'); +load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08'); +load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-08'); + +load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09'); +load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09'); +load data local inpath '../data/files/srcsortbucket3outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09'); +load data local inpath '../data/files/srcsortbucket4outof4.txt' INTO TABLE bucket_small partition(ds='2008-04-09'); + +CREATE TABLE bucket_big (key string, value string) partitioned by (ds string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE; +load data local inpath '../data/files/srcsortbucket1outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08'); +load data local inpath '../data/files/srcsortbucket2outof4.txt' INTO TABLE bucket_big partition(ds='2008-04-08'); + +set hive.optimize.bucketmapjoin = true; +explain extended select /* + MAPJOIN(a) */ count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key; +select /* + MAPJOIN(a) */ count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key; + +set hive.optimize.bucketmapjoin.sortedmerge = true; +set hive.input.format = org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat; +explain extended select /* + MAPJOIN(a) */ count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key; + + Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortedMergeBucketMapJoinOptimizer.java 
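The SortedMergeBucketMapJoinOptimizer.java hunk that follows adds a new rejection rule: for a small-table alias, sorted-merge conversion is only kept when every big-table bucket file is paired with at most one small-table file (the real check walks MapJoinDesc.getAliasBucketFileNameMapping() for that alias). A minimal sketch of that guard, with simplified types and a made-up class name:

import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

// Hypothetical illustration class; not part of the patch.
public class SmbGuardSketch {

  // Mirrors the added check: reject SMB conversion for a small-table alias if any
  // big-table bucket file would have to read more than one small-table file.
  static boolean eachBigBucketReadsSingleSmallFile(Map<String, List<String>> aliasMapping) {
    for (List<String> files : aliasMapping.values()) {
      if (files != null && files.size() > 1) {
        return false;
      }
    }
    return true;
  }

  public static void main(String[] args) {
    Map<String, List<String>> mapping = new LinkedHashMap<String, List<String>>();
    mapping.put("ds=2008-04-08/srcsortbucket1outof4.txt",
        Arrays.asList("ds=2008-04-08/srcsortbucket1outof4.txt"));
    System.out.println(eachBigBucketReadsSingleSmallFile(mapping));  // true: one-to-one
    mapping.put("ds=2008-04-08/srcsortbucket2outof4.txt",
        Arrays.asList("ds=2008-04-08/srcsortbucket2outof4.txt",
                      "ds=2008-04-09/srcsortbucket2outof4.txt"));
    System.out.println(eachBigBucketReadsSingleSmallFile(mapping));  // false: one-to-many
  }
}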
=================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortedMergeBucketMapJoinOptimizer.java (revision 1366995) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortedMergeBucketMapJoinOptimizer.java (working copy) @@ -194,6 +194,15 @@ if (tso == null) { return false; } + if (pos != op.getConf().getPosBigTable()) { + // currently, a file from a big table can be joined with only 1 file from a small table + for (List files : + op.getConf().getAliasBucketFileNameMapping().get(alias).values()) { + if (files != null && files.size() > 1) { + return false; + } + } + } List keys = op.getConf().getKeys().get((byte) pos); // get all join columns from join keys stored in MapJoinDesc @@ -229,32 +238,21 @@ LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e)); throw new SemanticException(e.getMessage(), e); } - int partNumber = prunedParts.getConfirmedPartns().size() - + prunedParts.getUnknownPartns().size(); - if (partNumber > 1) { - return false; - } - boolean ret = true; - for (Partition p : prunedParts.getConfirmedPartns()) { - ret = ret && checkSortColsAndJoinCols(p.getSortCols(), joinCols); - if (!ret) { + for (Partition partition : prunedParts.getNotDeniedPartns()) { + if (!checkSortColsAndJoinCols(partition.getSortCols(), joinCols)) { return false; } } - for (Partition p : prunedParts.getUnknownPartns()) { - ret = ret && checkSortColsAndJoinCols(p.getSortCols(), joinCols); - if (!ret) { - return false; - } - } - } else { - return checkSortColsAndJoinCols(tbl.getSortCols(), joinCols); + return true; } - return true; + return checkSortColsAndJoinCols(tbl.getSortCols(), joinCols); } private boolean checkSortColsAndJoinCols(List sortCols, List joinCols) { + if (sortCols == null || sortCols.size() != joinCols.size()) { + return false; + } // require all sort columns are asc, right now only support asc List sortColNames = new ArrayList(); for (Order o : sortCols) { @@ -264,8 +262,7 @@ sortColNames.add(o.getCol()); } - return sortColNames.containsAll(joinCols) - && sortColNames.size() == joinCols.size(); + return sortColNames.containsAll(joinCols); } } Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/BucketMapJoinOptimizer.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/BucketMapJoinOptimizer.java (revision 1366995) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/BucketMapJoinOptimizer.java (working copy) @@ -23,6 +23,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; +import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedHashMap; @@ -194,6 +195,7 @@ LinkedHashMap> bigTblPartsToBucketFileNames = new LinkedHashMap>(); LinkedHashMap bigTblPartsToBucketNumber = new LinkedHashMap(); + boolean bigTablePartitioned = true; for (int index = 0; index < joinAliases.size(); index++) { String alias = joinAliases.get(index); TableScanOperator tso = (TableScanOperator) topOps.get(alias); @@ -218,7 +220,12 @@ } List partitions = prunedParts.getNotDeniedPartns(); // construct a mapping of (Partition->bucket file names) and (Partition -> bucket number) - if (partitions.size() >= 1) { + if (partitions.isEmpty()) { + if (!alias.equals(baseBigAlias)) { + aliasToPartitionBucketNumberMapping.put(alias, Arrays.asList()); + aliasToPartitionBucketFileNamesMapping.put(alias, new ArrayList>()); + } + } else { List buckets = new ArrayList(); List> files = new ArrayList>(); for 
(Partition p : partitions) { @@ -238,11 +245,6 @@ aliasToPartitionBucketNumberMapping.put(alias, buckets); aliasToPartitionBucketFileNamesMapping.put(alias, files); } - } else { - if (!alias.equals(baseBigAlias)) { - aliasToPartitionBucketNumberMapping.put(alias, Arrays.asList()); - aliasToPartitionBucketFileNamesMapping.put(alias, new ArrayList>()); - } } } else { if (!checkBucketColumns(tbl.getBucketCols(), mjDecs, index)) { @@ -253,6 +255,7 @@ if (alias.equals(baseBigAlias)) { bigTblPartsToBucketFileNames.put(null, fileNames); bigTblPartsToBucketNumber.put(null, tbl.getNumBuckets()); + bigTablePartitioned = false; } else { aliasToPartitionBucketNumberMapping.put(alias, Arrays.asList(num)); aliasToPartitionBucketFileNamesMapping.put(alias, Arrays.asList(fileNames)); @@ -271,8 +274,8 @@ MapJoinDesc desc = mapJoinOp.getConf(); - LinkedHashMap>> aliasBucketFileNameMapping = - new LinkedHashMap>>(); + Map>> aliasBucketFileNameMapping = + new LinkedHashMap>>(); //sort bucket names for the big table for(List partBucketNames : bigTblPartsToBucketFileNames.values()) { @@ -292,7 +295,7 @@ List smallTblBucketNums = aliasToPartitionBucketNumberMapping.get(alias); List> smallTblFilesList = aliasToPartitionBucketFileNamesMapping.get(alias); - LinkedHashMap> mapping = new LinkedHashMap>(); + Map> mapping = new LinkedHashMap>(); aliasBucketFileNameMapping.put(alias, mapping); // for each bucket file in big table, get the corresponding bucket file @@ -307,21 +310,33 @@ int bigTblBucketNum = bigTblPartToBucketNum.next().getValue(); List bigTblBucketNameList = bigTblPartToBucketNames.next().getValue(); fillMapping(smallTblBucketNums, smallTblFilesList, - mapping, bigTblBucketNum, bigTblBucketNameList, desc.getBucketFileNameMapping()); + mapping, bigTblBucketNum, bigTblBucketNameList, desc.getBigTableBucketNumMapping()); } } desc.setAliasBucketFileNameMapping(aliasBucketFileNameMapping); desc.setBigTableAlias(baseBigAlias); + if (bigTablePartitioned) { + desc.setBigTablePartSpecToFileMapping(convert(bigTblPartsToBucketFileNames)); + } return null; } + // convert partition to partition spec string + private Map> convert(Map> mapping) { + Map> converted = new HashMap>(); + for (Map.Entry> entry : mapping.entrySet()) { + converted.put(entry.getKey().getName(), entry.getValue()); + } + return converted; + } + // called for each partition of big table and populates mapping for each file in the partition private void fillMapping( List smallTblBucketNums, List> smallTblFilesList, - LinkedHashMap> mapping, + Map> mapping, int bigTblBucketNum, List bigTblBucketNameList, - LinkedHashMap bucketFileNameMapping) { + Map bucketFileNameMapping) { for (int bindex = 0; bindex < bigTblBucketNameList.size(); bindex++) { ArrayList resultFileNames = new ArrayList(); Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java (revision 1366995) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java (working copy) @@ -58,6 +58,7 @@ import org.apache.hadoop.hive.ql.parse.PrunedPartitionList; import org.apache.hadoop.hive.ql.parse.RowResolver; import org.apache.hadoop.hive.ql.parse.SemanticException; +import org.apache.hadoop.hive.ql.plan.BucketMapJoinContext; import org.apache.hadoop.hive.ql.plan.FetchWork; import org.apache.hadoop.hive.ql.plan.FileSinkDesc; import org.apache.hadoop.hive.ql.plan.MapJoinDesc; @@ -69,7 +70,6 @@ import 
org.apache.hadoop.hive.ql.plan.TableDesc; import org.apache.hadoop.hive.ql.plan.TableScanDesc; import org.apache.hadoop.hive.ql.plan.FilterDesc.sampleDesc; -import org.apache.hadoop.hive.ql.plan.MapredLocalWork.BucketMapJoinContext; /** * General utility common functions for the Processor to convert operator into @@ -239,7 +239,7 @@ private static void setupBucketMapJoinInfo(MapredWork plan, AbstractMapJoinOperator currMapJoinOp, boolean createLocalPlan) { if (currMapJoinOp != null) { - LinkedHashMap>> aliasBucketFileNameMapping = + Map>> aliasBucketFileNameMapping = currMapJoinOp.getConf().getAliasBucketFileNameMapping(); if(aliasBucketFileNameMapping!= null) { MapredLocalWork localPlan = plan.getMapLocalWork(); @@ -276,10 +276,12 @@ BucketMapJoinContext bucketMJCxt = new BucketMapJoinContext(); localPlan.setBucketMapjoinContext(bucketMJCxt); bucketMJCxt.setAliasBucketFileNameMapping(aliasBucketFileNameMapping); - bucketMJCxt.setBucketFileNameMapping(currMapJoinOp.getConf().getBucketFileNameMapping()); + bucketMJCxt.setBucketFileNameMapping(currMapJoinOp.getConf().getBigTableBucketNumMapping()); localPlan.setInputFileChangeSensitive(true); bucketMJCxt.setMapJoinBigTableAlias(currMapJoinOp.getConf().getBigTableAlias()); bucketMJCxt.setBucketMatcherClass(org.apache.hadoop.hive.ql.exec.DefaultBucketMatcher.class); + bucketMJCxt.setBigTablePartSpecToFileMapping( + currMapJoinOp.getConf().getBigTablePartSpecToFileMapping()); } } } Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/MapJoinResolver.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/MapJoinResolver.java (revision 1366995) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/MapJoinResolver.java (working copy) @@ -129,6 +129,7 @@ newLocalWork.setDummyParentOp(dummyOps); newLocalWork.setTmpFileURI(tmpFileURI); newLocalWork.setInputFileChangeSensitive(localwork.getInputFileChangeSensitive()); + newLocalWork.setBucketMapjoinContext(localwork.copyPartSpecMappingOnly()); mapredWork.setMapLocalWork(newLocalWork); // get all parent tasks List> parentTasks = currTask.getParentTasks(); Index: ql/src/java/org/apache/hadoop/hive/ql/exec/HashTableSinkOperator.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/HashTableSinkOperator.java (revision 1366995) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/HashTableSinkOperator.java (working copy) @@ -399,12 +399,13 @@ HashMapWrapper hashTable = hashTables.getValue(); // get current input file name - String bigBucketFileName = this.getExecContext().getCurrentBigBucketFile(); - if (bigBucketFileName == null || bigBucketFileName.length() == 0) { - bigBucketFileName = "-"; - } + String bigBucketFileName = getExecContext().getCurrentBigBucketFile(); + + String fileName = getExecContext().getLocalWork().getBucketFileName(bigBucketFileName); + // get the tmp URI path; it will be a hdfs path if not local mode - String tmpURIPath = Utilities.generatePath(tmpURI, conf.getDumpFilePrefix(), tag, bigBucketFileName); + String dumpFilePrefix = conf.getDumpFilePrefix(); + String tmpURIPath = Utilities.generatePath(tmpURI, dumpFilePrefix, tag, fileName); hashTable.isAbort(rowNumber, console); console.printInfo(Utilities.now() + "\tDump the hashtable into file: " + tmpURIPath); // get the hashtable file and path Index: ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java 
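The Utilities.java hunk further below introduces fileNamePrefixedTaskIdRegex and getPrefixedTaskIdFromFilename so that output files whose names carry a partition-spec prefix keep distinct ids when duplicate files are pruned. The standalone sketch below uses the exact pattern added by the patch; the prefixed file name in the example is an assumed instance of the "partition spec + bucket number" form described in the ExecMapperContext comment, not output copied from a real run.

import java.util.regex.Matcher;
import java.util.regex.Pattern;

// Hypothetical illustration class; not part of the patch.
public class PrefixedTaskIdSketch {

  // Same pattern the patch adds to Utilities.java as fileNamePrefixedTaskIdRegex.
  private static final Pattern PREFIXED_TASK_ID =
      Pattern.compile("^.*?((\\(.*\\))?[0-9]+)(_[0-9]{1,3})?(\\..*)?$");

  // Returns the optional "(partition spec)" prefix plus the numeric task id,
  // or the input unchanged if the name does not match.
  static String prefixedTaskId(String filename) {
    Matcher m = PREFIXED_TASK_ID.matcher(filename);
    return m.matches() ? m.group(1) : filename;
  }

  public static void main(String[] args) {
    System.out.println(prefixedTaskId("000001_0"));                    // 000001
    // Assumed partition-prefixed name; keeping the prefix in the id means files
    // from different partitions are not treated as duplicates of each other.
    System.out.println(prefixedTaskId("(ds=2008-04-08)000001_0.gz"));  // (ds=2008-04-08)000001
  }
}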
=================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java (revision 1366995) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java (working copy) @@ -399,7 +399,7 @@ int filesIdx = 0; Set seenBuckets = new HashSet(); for (int idx = 0; idx < totalFiles; idx++) { - if (this.getExecContext() != null && this.getExecContext().getFileId() != -1) { + if (this.getExecContext() != null && this.getExecContext().getFileId() != null) { LOG.info("replace taskId from execContext "); taskId = Utilities.replaceTaskIdFromFilename(taskId, this.getExecContext().getFileId()); Index: ql/src/java/org/apache/hadoop/hive/ql/exec/MapJoinOperator.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/MapJoinOperator.java (revision 1366995) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/MapJoinOperator.java (working copy) @@ -159,17 +159,11 @@ boolean localMode = HiveConf.getVar(hconf, HiveConf.ConfVars.HADOOPJT).equals("local"); String baseDir = null; - String currentInputFile = HiveConf.getVar(hconf, HiveConf.ConfVars.HADOOPMAPFILENAME); + String currentInputFile = getExecContext().getCurrentInputFile(); LOG.info("******* Load from HashTable File: input : " + currentInputFile); - String currentFileName; + String fileName = getExecContext().getLocalWork().getBucketFileName(currentInputFile); - if (this.getExecContext().getLocalWork().getInputFileChangeSensitive()) { - currentFileName = this.getFileName(currentInputFile); - } else { - currentFileName = "-"; - } - try { if (localMode) { baseDir = this.getExecContext().getLocalWork().getTmpFileURI(); @@ -193,7 +187,7 @@ .entrySet()) { Byte pos = entry.getKey(); HashMapWrapper hashtable = entry.getValue(); - String filePath = Utilities.generatePath(baseDir, conf.getDumpFilePrefix(), pos, currentFileName); + String filePath = Utilities.generatePath(baseDir, conf.getDumpFilePrefix(), pos, fileName); Path path = new Path(filePath); LOG.info("\tLoad back 1 hashtable file from tmp file uri:" + path.toString()); hashtable.initilizePersistentHash(path.toUri().getPath()); @@ -288,17 +282,6 @@ } } - private String getFileName(String path) { - if (path == null || path.length() == 0) { - return null; - } - - int last_separator = path.lastIndexOf(Path.SEPARATOR) + 1; - String fileName = path.substring(last_separator); - return fileName; - - } - @Override public void closeOp(boolean abort) throws HiveException { Index: ql/src/java/org/apache/hadoop/hive/ql/exec/ExecMapperContext.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/ExecMapperContext.java (revision 1366995) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/ExecMapperContext.java (working copy) @@ -25,8 +25,8 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.ql.io.IOContext; +import org.apache.hadoop.hive.ql.plan.BucketMapJoinContext; import org.apache.hadoop.hive.ql.plan.MapredLocalWork; -import org.apache.hadoop.hive.ql.plan.MapredLocalWork.BucketMapJoinContext; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.util.ReflectionUtils; @@ -43,7 +43,11 @@ // so it won't be updated. 
private String currentInputFile = null; private boolean inputFileChecked = false; - private Integer fileId = new Integer(-1); + + // for SMB join, replaced with number part of task-id , making output file name + // if big alias is not partitioned table, it's bucket number + // if big alias is partitioned table, it's partition spec + bucket number + private String fileId = null; private MapredLocalWork localWork = null; private Map fetchOperators; private JobConf jc; @@ -147,11 +151,11 @@ this.localWork = localWork; } - public Integer getFileId() { + public String getFileId() { return fileId; } - public void setFileId(Integer fileId) { + public void setFileId(String fileId) { this.fileId = fileId; } Index: ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java (revision 1366995) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java (working copy) @@ -1142,6 +1142,12 @@ private static Pattern fileNameTaskIdRegex = Pattern.compile("^.*?([0-9]+)(_[0-9]{1,3})?(\\..*)?$"); /** + * This retruns prefix part + taskID for bucket join for partitioned table + */ + private static Pattern fileNamePrefixedTaskIdRegex = + Pattern.compile("^.*?((\\(.*\\))?[0-9]+)(_[0-9]{1,3})?(\\..*)?$"); + + /** * Get the task id from the filename. It is assumed that the filename is derived from the output * of getTaskId * @@ -1149,13 +1155,28 @@ * filename to extract taskid from */ public static String getTaskIdFromFilename(String filename) { + return getIdFromFilename(filename, fileNameTaskIdRegex); + } + + /** + * Get the part-spec + task id from the filename. It is assumed that the filename is derived + * from the output of getTaskId + * + * @param filename + * filename to extract taskid from + */ + public static String getPrefixedTaskIdFromFilename(String filename) { + return getIdFromFilename(filename, fileNamePrefixedTaskIdRegex); + } + + private static String getIdFromFilename(String filename, Pattern pattern) { String taskId = filename; int dirEnd = filename.lastIndexOf(Path.SEPARATOR); if (dirEnd != -1) { taskId = filename.substring(dirEnd + 1); } - Matcher m = fileNameTaskIdRegex.matcher(taskId); + Matcher m = pattern.matcher(taskId); if (!m.matches()) { LOG.warn("Unable to get task id from file name: " + filename + ". 
Using last component" + taskId + " as task id."); @@ -1174,14 +1195,21 @@ * filename to replace taskid "0_0" or "0_0.gz" by 33 to "33_0" or "33_0.gz" */ public static String replaceTaskIdFromFilename(String filename, int bucketNum) { + return replaceTaskIdFromFilename(filename, String.valueOf(bucketNum)); + } + + public static String replaceTaskIdFromFilename(String filename, String fileId) { String taskId = getTaskIdFromFilename(filename); - String newTaskId = replaceTaskId(taskId, bucketNum); + String newTaskId = replaceTaskId(taskId, fileId); String ret = replaceTaskIdFromFilename(filename, taskId, newTaskId); return (ret); } private static String replaceTaskId(String taskId, int bucketNum) { - String strBucketNum = String.valueOf(bucketNum); + return replaceTaskId(taskId, String.valueOf(bucketNum)); + } + + private static String replaceTaskId(String taskId, String strBucketNum) { int bucketNumLen = strBucketNum.length(); int taskIdLen = taskId.length(); StringBuffer s = new StringBuffer(); @@ -1407,7 +1435,7 @@ throw new IOException("Unable to delete tmp file: " + one.getPath()); } } else { - String taskId = getTaskIdFromFilename(one.getPath().getName()); + String taskId = getPrefixedTaskIdFromFilename(one.getPath().getName()); FileStatus otherFile = taskIdToFile.get(taskId); if (otherFile == null) { taskIdToFile.put(taskId, one); Index: ql/src/java/org/apache/hadoop/hive/ql/exec/SMBMapJoinOperator.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/SMBMapJoinOperator.java (revision 1366995) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/SMBMapJoinOperator.java (working copy) @@ -31,11 +31,11 @@ import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.ql.exec.persistence.RowContainer; import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.plan.BucketMapJoinContext; import org.apache.hadoop.hive.ql.plan.FetchWork; import org.apache.hadoop.hive.ql.plan.MapJoinDesc; import org.apache.hadoop.hive.ql.plan.MapredLocalWork; import org.apache.hadoop.hive.ql.plan.SMBJoinDesc; -import org.apache.hadoop.hive.ql.plan.MapredLocalWork.BucketMapJoinContext; import org.apache.hadoop.hive.ql.plan.api.OperatorType; import org.apache.hadoop.hive.serde2.ColumnProjectionUtils; import org.apache.hadoop.hive.serde2.objectinspector.InspectableObject; @@ -475,23 +475,22 @@ } private void setUpFetchOpContext(FetchOperator fetchOp, String alias) { - String currentInputFile = this.getExecContext().getCurrentInputFile(); - BucketMapJoinContext bucketMatcherCxt = this.localWork - .getBucketMapjoinContext(); + String currentInputFile = getExecContext().getCurrentInputFile(); + BucketMapJoinContext bucketMatcherCxt = localWork.getBucketMapjoinContext(); + Class bucketMatcherCls = bucketMatcherCxt .getBucketMatcherClass(); BucketMatcher bucketMatcher = (BucketMatcher) ReflectionUtils.newInstance( bucketMatcherCls, null); - Integer bucketNum = bucketMatcherCxt.getBucketFileNameMapping().get(currentInputFile); - if (bucketNum != null) { - this.getExecContext().setFileId(bucketNum); - } - LOG.info("set task id: " + this.getExecContext().getFileId()); + getExecContext().setFileId(bucketMatcherCxt.createFileId(currentInputFile)); + LOG.info("set task id: " + getExecContext().getFileId()); + bucketMatcher.setAliasBucketFileNameMapping(bucketMatcherCxt .getAliasBucketFileNameMapping()); List aliasFiles = bucketMatcher.getAliasBucketFiles(currentInputFile, bucketMatcherCxt.getMapJoinBigTableAlias(), 
alias); + Iterator iter = aliasFiles.iterator(); fetchOp.setupContext(iter, null); } Index: ql/src/java/org/apache/hadoop/hive/ql/exec/BucketMatcher.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/BucketMatcher.java (revision 1366995) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/BucketMatcher.java (working copy) @@ -18,9 +18,8 @@ package org.apache.hadoop.hive.ql.exec; -import java.util.ArrayList; -import java.util.LinkedHashMap; import java.util.List; +import java.util.Map; import org.apache.hadoop.fs.Path; @@ -29,9 +28,9 @@ public List getAliasBucketFiles(String currentInputFile, String refTableAlias, String alias); public void setAliasBucketFileNameMapping( - LinkedHashMap>> aliasBucketFileNameMapping); + Map>> aliasBucketFileNameMapping); - public LinkedHashMap getBucketFileNameMapping(); + public Map getBucketFileNameMapping(); - public void setBucketFileNameMapping(LinkedHashMap bucketFileNameMapping); + public void setBucketFileNameMapping(Map bucketFileNameMapping); } Index: ql/src/java/org/apache/hadoop/hive/ql/exec/MapredLocalTask.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/MapredLocalTask.java (revision 1366995) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/MapredLocalTask.java (working copy) @@ -29,7 +29,6 @@ import java.util.Collection; import java.util.HashMap; import java.util.Iterator; -import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.Properties; @@ -50,9 +49,9 @@ import org.apache.hadoop.hive.ql.exec.persistence.HashMapWrapper; import org.apache.hadoop.hive.ql.exec.persistence.MapJoinObjectValue; import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.plan.BucketMapJoinContext; import org.apache.hadoop.hive.ql.plan.FetchWork; import org.apache.hadoop.hive.ql.plan.MapredLocalWork; -import org.apache.hadoop.hive.ql.plan.MapredLocalWork.BucketMapJoinContext; import org.apache.hadoop.hive.ql.plan.api.StageType; import org.apache.hadoop.hive.ql.session.SessionState; import org.apache.hadoop.hive.ql.session.SessionState.LogHelper; @@ -267,7 +266,7 @@ initializeOperators(fetchOpJobConfMap); // for each big table's bucket, call the start forward if (inputFileChangeSenstive) { - for (LinkedHashMap> bigTableBucketFiles : work + for (Map> bigTableBucketFiles : work .getBucketMapjoinContext().getAliasBucketFileNameMapping().values()) { for (String bigTableBucket : bigTableBucketFiles.keySet()) { startForward(inputFileChangeSenstive, bigTableBucket); @@ -308,7 +307,7 @@ if (fetchOp.isEmptyTable()) { //generate empty hashtable for empty table - this.generateDummyHashTable(alias, getFileName(bigTableBucket)); + this.generateDummyHashTable(alias, bigTableBucket); continue; } @@ -319,8 +318,7 @@ InspectableObject row = fetchOp.getNextRow(); if (row == null) { if (inputFileChangeSenstive) { - String fileName = this.getFileName(bigTableBucket); - execContext.setCurrentBigBucketFile(fileName); + execContext.setCurrentBigBucketFile(bigTableBucket); forwardOp.reset(); } forwardOp.close(false); @@ -406,12 +404,11 @@ HashMapWrapper hashTable = new HashMapWrapper(); - if (bigBucketFileName == null || bigBucketFileName.length() == 0) { - bigBucketFileName = "-"; - } + String fileName = work.getBucketFileName(bigBucketFileName); + HashTableSinkOperator htso = (HashTableSinkOperator)childOp; String tmpURIPath = Utilities.generatePath(tmpURI, 
htso.getConf().getDumpFilePrefix(), - tag, bigBucketFileName); + tag, fileName); console.printInfo(Utilities.now() + "\tDump the hashtable into file: " + tmpURIPath); Path path = new Path(tmpURIPath); FileSystem fs = path.getFileSystem(job); @@ -439,17 +436,6 @@ fetchOp.setupContext(iter, null); } - private String getFileName(String path) { - if (path == null || path.length() == 0) { - return null; - } - - int last_separator = path.lastIndexOf(Path.SEPARATOR) + 1; - String fileName = path.substring(last_separator); - return fileName; - - } - @Override public void localizeMRTmpFilesImpl(Context ctx) { Index: ql/src/java/org/apache/hadoop/hive/ql/exec/DefaultBucketMatcher.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/DefaultBucketMatcher.java (revision 1366995) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/DefaultBucketMatcher.java (working copy) @@ -21,6 +21,7 @@ import java.util.ArrayList; import java.util.LinkedHashMap; import java.util.List; +import java.util.Map; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -32,9 +33,9 @@ protected Log LOG = LogFactory.getLog(this.getClass().getName()); //MAPPING: bucket_file_name_in_big_table->{alias_table->corresonding_bucket_file_names} - private LinkedHashMap>> aliasBucketMapping; + private Map>> aliasBucketMapping; - private LinkedHashMap bucketFileNameMapping; + private Map bucketFileNameMapping; public DefaultBucketMatcher(){ bucketFileNameMapping = new LinkedHashMap(); @@ -53,15 +54,15 @@ } public void setAliasBucketFileNameMapping( - LinkedHashMap>> aliasBucketFileNameMapping) { + Map>> aliasBucketFileNameMapping) { this.aliasBucketMapping = aliasBucketFileNameMapping; } - public LinkedHashMap getBucketFileNameMapping() { + public Map getBucketFileNameMapping() { return bucketFileNameMapping; } - public void setBucketFileNameMapping(LinkedHashMap bucketFileNameMapping) { + public void setBucketFileNameMapping(Map bucketFileNameMapping) { this.bucketFileNameMapping = bucketFileNameMapping; } Index: ql/src/java/org/apache/hadoop/hive/ql/plan/HashTableSinkDesc.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/plan/HashTableSinkDesc.java (revision 1366995) +++ ql/src/java/org/apache/hadoop/hive/ql/plan/HashTableSinkDesc.java (working copy) @@ -22,7 +22,6 @@ import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; -import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; @@ -75,17 +74,14 @@ private Map> retainList; - private transient String bigTableAlias; - - private LinkedHashMap>> aliasBucketFileNameMapping; - private LinkedHashMap bucketFileNameMapping; + private transient BucketMapJoinContext bucketMapjoinContext; private float hashtableMemoryUsage; //map join dump file name private String dumpFilePrefix; public HashTableSinkDesc() { - bucketFileNameMapping = new LinkedHashMap(); + bucketMapjoinContext = new BucketMapJoinContext(); } public HashTableSinkDesc(MapJoinDesc clone) { @@ -109,10 +105,8 @@ this.valueTblFilteredDescs = clone.getValueFilteredTblDescs(); this.posBigTable = clone.getPosBigTable(); this.retainList = clone.getRetainList(); - this.bigTableAlias = clone.getBigTableAlias(); - this.aliasBucketFileNameMapping = clone.getAliasBucketFileNameMapping(); - this.bucketFileNameMapping = clone.getBucketFileNameMapping(); this.dumpFilePrefix = clone.getDumpFilePrefix(); + 
this.bucketMapjoinContext = new BucketMapJoinContext(clone); } @@ -362,34 +356,11 @@ this.valueTblDescs = valueTblDescs; } - /** - * @return bigTableAlias - */ - public String getBigTableAlias() { - return bigTableAlias; + public BucketMapJoinContext getBucketMapjoinContext() { + return bucketMapjoinContext; } - /** - * @param bigTableAlias - */ - public void setBigTableAlias(String bigTableAlias) { - this.bigTableAlias = bigTableAlias; + public void setBucketMapjoinContext(BucketMapJoinContext bucketMapjoinContext) { + this.bucketMapjoinContext = bucketMapjoinContext; } - - public LinkedHashMap>> getAliasBucketFileNameMapping() { - return aliasBucketFileNameMapping; - } - - public void setAliasBucketFileNameMapping( - LinkedHashMap>> aliasBucketFileNameMapping) { - this.aliasBucketFileNameMapping = aliasBucketFileNameMapping; - } - - public LinkedHashMap getBucketFileNameMapping() { - return bucketFileNameMapping; - } - - public void setBucketFileNameMapping(LinkedHashMap bucketFileNameMapping) { - this.bucketFileNameMapping = bucketFileNameMapping; - } } Index: ql/src/java/org/apache/hadoop/hive/ql/plan/MapredLocalWork.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/plan/MapredLocalWork.java (revision 1366995) +++ ql/src/java/org/apache/hadoop/hive/ql/plan/MapredLocalWork.java (working copy) @@ -18,16 +18,11 @@ package org.apache.hadoop.hive.ql.plan; -import java.io.File; import java.io.Serializable; -import java.util.ArrayList; -import java.util.Iterator; import java.util.LinkedHashMap; import java.util.List; -import java.util.Map.Entry; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.ql.exec.BucketMatcher; import org.apache.hadoop.hive.ql.exec.Operator; /** @@ -132,6 +127,11 @@ } @Explain(displayName = "Bucket Mapjoin Context", normalExplain = false) + public BucketMapJoinContext getBucketMapjoinContextExplain() { + return bucketMapjoinContext != null && + bucketMapjoinContext.getBucketFileNameMapping() != null ? 
bucketMapjoinContext : null; + } + public BucketMapJoinContext getBucketMapjoinContext() { return bucketMapjoinContext; } @@ -140,6 +140,17 @@ this.bucketMapjoinContext = bucketMapjoinContext; } + public BucketMapJoinContext copyPartSpecMappingOnly() { + if (bucketMapjoinContext != null && + bucketMapjoinContext.getBigTablePartSpecToFileMapping() != null) { + BucketMapJoinContext context = new BucketMapJoinContext(); + context.setBigTablePartSpecToFileMapping( + bucketMapjoinContext.getBigTablePartSpecToFileMapping()); + return context; + } + return null; + } + public void setTmpFileURI(String tmpFileURI) { this.tmpFileURI = tmpFileURI; } @@ -148,121 +159,22 @@ return tmpFileURI; } - public static class BucketMapJoinContext implements Serializable { - - private static final long serialVersionUID = 1L; - - // used for bucket map join - private LinkedHashMap>> aliasBucketFileNameMapping; - private String mapJoinBigTableAlias; - private Class bucketMatcherClass; - - private LinkedHashMap>> aliasBucketBaseFileNameMapping; - private LinkedHashMap bucketFileNameMapping; - - public void setMapJoinBigTableAlias(String bigTableAlias) { - this.mapJoinBigTableAlias = bigTableAlias; + public String getBucketFileName(String bigFileName) { + if (!inputFileChangeSensitive || bigFileName == null || bigFileName.isEmpty()) { + return "-"; } - - - public void deriveBucketMapJoinMapping() { - if (aliasBucketFileNameMapping != null) { - Iterator>>> iter = - aliasBucketFileNameMapping.entrySet().iterator(); - aliasBucketBaseFileNameMapping = new LinkedHashMap>>(); - - while (iter.hasNext()) { - Entry>> old = iter.next(); - - LinkedHashMap> newBucketBaseFileNameMapping = new LinkedHashMap>(); - Iterator>> oldAliasFileNameMappingIter = old.getValue().entrySet().iterator(); - while (oldAliasFileNameMappingIter.hasNext()) { - //For a give table and its bucket full file path list, only keep the base file name (remove file path etc). - //And put the new list into the new mapping. - Entry> oldTableBucketFileNames = oldAliasFileNameMappingIter.next(); - ArrayList oldTableBucketNames = oldTableBucketFileNames.getValue(); - ArrayList newTableBucketFileBaseName = new ArrayList (oldTableBucketNames.size()); - //for each bucket file, only keep its base files and store into a new list. - if (oldTableBucketNames != null) { - for (String bucketFName : oldTableBucketNames) { - newTableBucketFileBaseName.add(getBaseFileName(bucketFName)); - } - } - String bigTblBucketFileName = getBaseFileName(oldTableBucketFileNames.getKey()); - if(newBucketBaseFileNameMapping.containsKey(bigTblBucketFileName)) { - String fullPath = oldTableBucketFileNames.getKey(); - String dir = getBaseFileName(fullPath.substring(0, fullPath.lastIndexOf(bigTblBucketFileName))); - bigTblBucketFileName = dir + File.separator + bigTblBucketFileName; - } - //put the new mapping - newBucketBaseFileNameMapping.put(bigTblBucketFileName, newTableBucketFileBaseName); - } - String tableAlias = old.getKey(); - aliasBucketBaseFileNameMapping.put(tableAlias, newBucketBaseFileNameMapping); - } - } + String fileName = getFileName(bigFileName); + if (bucketMapjoinContext != null) { + fileName = bucketMapjoinContext.createFileName(bigFileName, fileName); } + return fileName; + } - private String getBaseFileName (String path) { - try { - return ((new Path(path)).getName()); - } catch (Exception ex) { - // This could be due to either URI syntax error or File constructor - // illegal arg; we don't really care which one it is. 
- return path; - } + private String getFileName(String path) { + int last_separator = path.lastIndexOf(Path.SEPARATOR); + if (last_separator < 0) { + return path; } - - public String getMapJoinBigTableAlias() { - return mapJoinBigTableAlias; - } - - public Class getBucketMatcherClass() { - return bucketMatcherClass; - } - - public void setBucketMatcherClass( - Class bucketMatcherClass) { - this.bucketMatcherClass = bucketMatcherClass; - } - - @Explain(displayName = "Alias Bucket File Name Mapping", normalExplain = false) - public LinkedHashMap>> getAliasBucketFileNameMapping() { - return aliasBucketFileNameMapping; - } - - public void setAliasBucketFileNameMapping( - LinkedHashMap>> aliasBucketFileNameMapping) { - this.aliasBucketFileNameMapping = aliasBucketFileNameMapping; - } - - @Override - public String toString() { - if (aliasBucketFileNameMapping != null) { - return "Mapping:" + aliasBucketFileNameMapping.toString(); - } else { - return ""; - } - } - - @Explain(displayName = "Alias Bucket Base File Name Mapping", normalExplain = false) - public LinkedHashMap>> getAliasBucketBaseFileNameMapping() { - return aliasBucketBaseFileNameMapping; - } - - public void setAliasBucketBaseFileNameMapping( - LinkedHashMap>> aliasBucketBaseFileNameMapping) { - this.aliasBucketBaseFileNameMapping = aliasBucketBaseFileNameMapping; - } - - @Explain(displayName = "Alias Bucket Output File Name Mapping", normalExplain = false) - public LinkedHashMap getBucketFileNameMapping() { - return bucketFileNameMapping; - } - - public void setBucketFileNameMapping(LinkedHashMap bucketFileNameMapping) { - this.bucketFileNameMapping = bucketFileNameMapping; - } - + return path.substring(last_separator + 1); } } Index: ql/src/java/org/apache/hadoop/hive/ql/plan/BucketMapJoinContext.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/plan/BucketMapJoinContext.java (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/plan/BucketMapJoinContext.java (working copy) @@ -0,0 +1,216 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.plan; + +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.exec.BucketMatcher; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.regex.Pattern; + +/** + * Originally an inner class of MapredLocalWork; 
context for bucket mapjoin (or smb join) + */ +public class BucketMapJoinContext implements Serializable { + + private static final long serialVersionUID = 1L; + + // table alias (small) --> input file name (big) --> target file names (small) + private Map>> aliasBucketFileNameMapping; + private String mapJoinBigTableAlias; + private Class bucketMatcherClass; + + // summary of aliasBucketFileNameMapping for test result + // full paths are replaced with base filenames + private transient Map>> aliasBucketBaseFileNameMapping; + + // input file name (big) to bucket number + private Map bucketFileNameMapping; + + // partition spec string to input file names (big) + private Map> bigTablePartSpecToFileMapping; + + // inverse of partSpecToFileMapping, populated at runtime + private transient Map inputToPartSpecMapping; + + public BucketMapJoinContext() {} + + public BucketMapJoinContext(MapJoinDesc clone) { + this.mapJoinBigTableAlias = clone.getBigTableAlias(); + this.aliasBucketFileNameMapping = clone.getAliasBucketFileNameMapping(); + this.bucketFileNameMapping = clone.getBigTableBucketNumMapping(); + this.bigTablePartSpecToFileMapping = clone.getBigTablePartSpecToFileMapping(); + } + + public void setMapJoinBigTableAlias(String bigTableAlias) { + this.mapJoinBigTableAlias = bigTableAlias; + } + + public void deriveBucketMapJoinMapping() { + if (aliasBucketFileNameMapping != null) { + aliasBucketBaseFileNameMapping = new LinkedHashMap>>(); + + for (Map.Entry>> aliasToMappins + : aliasBucketFileNameMapping.entrySet()) { + String tableAlias = aliasToMappins.getKey(); + Map> fullPathMappings = aliasToMappins.getValue(); + + Map> baseFileNameMapping = new LinkedHashMap>(); + for (Map.Entry> inputToBuckets : fullPathMappings.entrySet()) { + // For a given table and its bucket full file path list, + // only keep the base file name (remove file path etc). + // And put the new list into the new mapping. + String inputPath = inputToBuckets.getKey(); + List bucketPaths = inputToBuckets.getValue(); + + List bucketBaseFileNames = new ArrayList(bucketPaths.size()); + //for each bucket file, only keep its base files and store into a new list. + for (String bucketFName : bucketPaths) { + bucketBaseFileNames.add(getBaseFileName(bucketFName)); + } + //put the new mapping + baseFileNameMapping.put(getBaseFileName(inputPath), bucketBaseFileNames); + } + aliasBucketBaseFileNameMapping.put(tableAlias, baseFileNameMapping); + } + } + } + + private static final Pattern partPattern = Pattern.compile("^[^=]+=[^=]+$"); + + // extract partition spec to file name part from path + private String getBaseFileName(String string) { + try { + Path path = new Path(string); + Path cursor = path.getParent(); + while (partPattern.matcher(cursor.getName()).matches()) { + cursor = cursor.getParent(); + } + return cursor.toUri().relativize(path.toUri()).getPath(); + } catch (Exception ex) { + // This could be due to either URI syntax error or File constructor + // illegal arg; we don't really care which one it is. 
+ return string; + } + } + + public String getMapJoinBigTableAlias() { + return mapJoinBigTableAlias; + } + + public Class getBucketMatcherClass() { + return bucketMatcherClass; + } + + public void setBucketMatcherClass( + Class bucketMatcherClass) { + this.bucketMatcherClass = bucketMatcherClass; + } + + @Explain(displayName = "Alias Bucket File Name Mapping", normalExplain = false) + public Map>> getAliasBucketFileNameMapping() { + return aliasBucketFileNameMapping; + } + + public void setAliasBucketFileNameMapping( + Map>> aliasBucketFileNameMapping) { + this.aliasBucketFileNameMapping = aliasBucketFileNameMapping; + } + + @Override + public String toString() { + if (aliasBucketFileNameMapping != null) { + return "Mapping:" + aliasBucketFileNameMapping.toString(); + } else { + return ""; + } + } + + @Explain(displayName = "Alias Bucket Base File Name Mapping", normalExplain = false) + public Map>> getAliasBucketBaseFileNameMapping() { + return aliasBucketBaseFileNameMapping; + } + + public void setAliasBucketBaseFileNameMapping( + Map>> aliasBucketBaseFileNameMapping) { + this.aliasBucketBaseFileNameMapping = aliasBucketBaseFileNameMapping; + } + + @Explain(displayName = "Alias Bucket Output File Name Mapping", normalExplain = false) + public Map getBucketFileNameMapping() { + return bucketFileNameMapping; + } + + public void setBucketFileNameMapping(Map bucketFileNameMapping) { + this.bucketFileNameMapping = bucketFileNameMapping; + } + + public Map> getBigTablePartSpecToFileMapping() { + return bigTablePartSpecToFileMapping; + } + + public void setBigTablePartSpecToFileMapping( + Map> bigTablePartSpecToFileMapping) { + this.bigTablePartSpecToFileMapping = bigTablePartSpecToFileMapping; + } + + // returns the fileId for an SMB join, which makes up part of the result file name; + // needed to avoid file name conflicts when the big table is partitioned + public String createFileId(String inputPath) { + String bucketNum = String.valueOf(bucketFileNameMapping.get(inputPath)); + if (bigTablePartSpecToFileMapping != null) { + // bigTablePartSpecToFileMapping is non-null only when the big table is partitioned + return prependPartSpec(inputPath, bucketNum); + } + return bucketNum; + } + + // returns the name of the hash file produced by HASHTABLESINK and read by MAPJOIN + public String createFileName(String inputPath, String fileName) { + if (bigTablePartSpecToFileMapping != null) { + // bigTablePartSpecToFileMapping is non-null only when the big table is partitioned + return prependPartSpec(inputPath, fileName); + } + return fileName; + } + + // prepends the partition spec of the input path to the candidate file name + private String prependPartSpec(String inputPath, String fileName) { + Map mapping = inputToPartSpecMapping == null ? + inputToPartSpecMapping = revert(bigTablePartSpecToFileMapping) : inputToPartSpecMapping; + String partSpec = mapping.get(inputPath); + return partSpec == null || partSpec.isEmpty() ? 
fileName : "(" + partSpec + ")" + fileName; + } + + // revert partSpecToFileMapping to inputToPartSpecMapping + private Map revert(Map> mapping) { + Map converted = new HashMap(); + for (Map.Entry> entry : mapping.entrySet()) { + String partSpec = entry.getKey(); + for (String file : entry.getValue()) { + converted.put(file, partSpec); + } + } + return converted; + } +} Index: ql/src/java/org/apache/hadoop/hive/ql/plan/MapJoinDesc.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/plan/MapJoinDesc.java (revision 1366995) +++ ql/src/java/org/apache/hadoop/hive/ql/plan/MapJoinDesc.java (working copy) @@ -47,14 +47,15 @@ private transient String bigTableAlias; - private LinkedHashMap>> aliasBucketFileNameMapping; - private LinkedHashMap bucketFileNameMapping; + private Map>> aliasBucketFileNameMapping; + private Map bigTableBucketNumMapping; + private Map> bigTablePartSpecToFileMapping; //map join dump file name private String dumpFilePrefix; public MapJoinDesc() { - bucketFileNameMapping = new LinkedHashMap(); + bigTableBucketNumMapping = new LinkedHashMap(); } public MapJoinDesc(MapJoinDesc clone) { @@ -66,7 +67,8 @@ this.retainList = clone.retainList; this.bigTableAlias = clone.bigTableAlias; this.aliasBucketFileNameMapping = clone.aliasBucketFileNameMapping; - this.bucketFileNameMapping = clone.bucketFileNameMapping; + this.bigTableBucketNumMapping = clone.bigTableBucketNumMapping; + this.bigTablePartSpecToFileMapping = clone.bigTablePartSpecToFileMapping; this.dumpFilePrefix = clone.dumpFilePrefix; } @@ -81,7 +83,7 @@ this.valueTblDescs = valueTblDescs; this.valueFilteredTblDescs = valueFilteredTblDescs; this.posBigTable = posBigTable; - this.bucketFileNameMapping = new LinkedHashMap(); + this.bigTableBucketNumMapping = new LinkedHashMap(); this.dumpFilePrefix = dumpFilePrefix; initRetainExprList(); } @@ -207,20 +209,28 @@ this.bigTableAlias = bigTableAlias; } - public LinkedHashMap>> getAliasBucketFileNameMapping() { + public Map>> getAliasBucketFileNameMapping() { return aliasBucketFileNameMapping; } public void setAliasBucketFileNameMapping( - LinkedHashMap>> aliasBucketFileNameMapping) { + Map>> aliasBucketFileNameMapping) { this.aliasBucketFileNameMapping = aliasBucketFileNameMapping; } - public LinkedHashMap getBucketFileNameMapping() { - return bucketFileNameMapping; + public Map getBigTableBucketNumMapping() { + return bigTableBucketNumMapping; } - public void setBucketFileNameMapping(LinkedHashMap bucketFileNameMapping) { - this.bucketFileNameMapping = bucketFileNameMapping; + public void setBigTableBucketNumMapping(Map bigTableBucketNumMapping) { + this.bigTableBucketNumMapping = bigTableBucketNumMapping; } + + public Map> getBigTablePartSpecToFileMapping() { + return bigTablePartSpecToFileMapping; + } + + public void setBigTablePartSpecToFileMapping(Map> partToFileMapping) { + this.bigTablePartSpecToFileMapping = partToFileMapping; + } }
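A minimal, standalone sketch (not part of the patch) of how the new fileNamePrefixedTaskIdRegex in Utilities.java is expected to behave: it pulls either a bare task id or a partition-spec-prefixed task id such as "(ds=2008-04-08)000001" out of a bucket output file name. The class name PrefixedTaskIdDemo and the sample paths are hypothetical, and '/' stands in for Path.SEPARATOR so the sketch has no Hadoop dependency.

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class PrefixedTaskIdDemo {

  // same pattern the patch adds to Utilities.java as fileNamePrefixedTaskIdRegex
  private static final Pattern PREFIXED_TASK_ID =
      Pattern.compile("^.*?((\\(.*\\))?[0-9]+)(_[0-9]{1,3})?(\\..*)?$");

  static String prefixedTaskId(String filename) {
    // strip the directory part first, as Utilities.getIdFromFilename() does;
    // '/' is used here instead of Path.SEPARATOR to avoid the Hadoop dependency
    int dirEnd = filename.lastIndexOf('/');
    String name = dirEnd == -1 ? filename : filename.substring(dirEnd + 1);
    Matcher m = PREFIXED_TASK_ID.matcher(name);
    return m.matches() ? m.group(1) : name;
  }

  public static void main(String[] args) {
    System.out.println(prefixedTaskId("/tmp/out/000001_0"));                   // 000001
    System.out.println(prefixedTaskId("/tmp/out/(ds=2008-04-08)000001_0"));    // (ds=2008-04-08)000001
    System.out.println(prefixedTaskId("/tmp/out/(ds=2008-04-08)000001_0.gz")); // (ds=2008-04-08)000001
  }
}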
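Similarly, a simplified, self-contained approximation of the naming rule implemented by BucketMapJoinContext.createFileId() and prependPartSpec(): for a non-partitioned big table the fileId is just the bucket number, while for a partitioned big table the partition spec is prepended in parentheses so buckets from different partitions cannot produce colliding file names. The class FileIdNamingSketch, the warehouse paths, and the ds=2008-04-08 partition spec are all made up for illustration; this is not the patch's implementation.

import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class FileIdNamingSketch {

  // big-table input file -> bucket number (stands in for bucketFileNameMapping)
  private final Map<String, Integer> bucketNumMapping = new HashMap<String, Integer>();
  // partition spec -> big-table input files (stands in for bigTablePartSpecToFileMapping);
  // left null for a non-partitioned big table
  private final Map<String, List<String>> partSpecToFiles;

  FileIdNamingSketch(Map<String, List<String>> partSpecToFiles) {
    this.partSpecToFiles = partSpecToFiles;
  }

  String createFileId(String inputPath) {
    String bucketNum = String.valueOf(bucketNumMapping.get(inputPath));
    if (partSpecToFiles == null) {
      return bucketNum; // non-partitioned big table: bucket number only
    }
    // invert partSpec -> files into file -> partSpec, as revert() does in the patch
    Map<String, String> fileToPartSpec = new HashMap<String, String>();
    for (Map.Entry<String, List<String>> e : partSpecToFiles.entrySet()) {
      for (String file : e.getValue()) {
        fileToPartSpec.put(file, e.getKey());
      }
    }
    String partSpec = fileToPartSpec.get(inputPath);
    // partitioned big table: "(partition spec)" is prepended to the bucket number
    return partSpec == null || partSpec.isEmpty() ? bucketNum : "(" + partSpec + ")" + bucketNum;
  }

  public static void main(String[] args) {
    Map<String, List<String>> partSpecToFiles = new HashMap<String, List<String>>();
    partSpecToFiles.put("ds=2008-04-08",
        Arrays.asList("/warehouse/big/ds=2008-04-08/srcsortbucket1outof4.txt"));

    FileIdNamingSketch sketch = new FileIdNamingSketch(partSpecToFiles);
    sketch.bucketNumMapping.put("/warehouse/big/ds=2008-04-08/srcsortbucket1outof4.txt", 0);

    // prints "(ds=2008-04-08)0"
    System.out.println(sketch.createFileId("/warehouse/big/ds=2008-04-08/srcsortbucket1outof4.txt"));
  }
}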