diff --git .gitattributes .gitattributes
index 451debf..fe90dd9 100644
--- .gitattributes
+++ .gitattributes
@@ -11,6 +11,7 @@
 *.js text
 *.sql text
 *.q text
+*.q.out text diff
 *.sh text eol=lf
diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/mr/MapredLocalTask.java ql/src/java/org/apache/hadoop/hive/ql/exec/mr/MapredLocalTask.java
index c951fca..34b063d 100644
--- ql/src/java/org/apache/hadoop/hive/ql/exec/mr/MapredLocalTask.java
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/mr/MapredLocalTask.java
@@ -19,7 +19,6 @@
 import java.io.File;
 import java.io.IOException;
-import java.io.ObjectOutputStream;
 import java.io.OutputStream;
 import java.io.Serializable;
 import java.lang.management.ManagementFactory;
@@ -46,7 +45,6 @@
 import org.apache.hadoop.hive.ql.QueryPlan;
 import org.apache.hadoop.hive.ql.exec.BucketMatcher;
 import org.apache.hadoop.hive.ql.exec.FetchOperator;
-import org.apache.hadoop.hive.ql.exec.HashTableSinkOperator;
 import org.apache.hadoop.hive.ql.exec.Operator;
 import org.apache.hadoop.hive.ql.exec.SecureCmdDoAs;
 import org.apache.hadoop.hive.ql.exec.TableScanOperator;
@@ -54,7 +52,6 @@
 import org.apache.hadoop.hive.ql.exec.Utilities;
 import org.apache.hadoop.hive.ql.exec.Utilities.StreamPrinter;
 import org.apache.hadoop.hive.ql.exec.mapjoin.MapJoinMemoryExhaustionException;
-import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainerSerDe;
 import org.apache.hadoop.hive.ql.io.HiveInputFormat;
 import org.apache.hadoop.hive.ql.metadata.HiveException;
 import org.apache.hadoop.hive.ql.plan.BucketMapJoinContext;
@@ -340,6 +337,12 @@ public void startForward(String bigTableBucket) throws Exception {
 
   private void startForward(boolean inputFileChangeSenstive, String bigTableBucket)
       throws Exception {
+    for (Operator<?> source : work.getAliasToWork().values()) {
+      source.reset();
+    }
+    if (inputFileChangeSenstive) {
+      execContext.setCurrentBigBucketFile(bigTableBucket);
+    }
     for (Map.Entry<String, FetchOperator> entry : fetchOperators.entrySet()) {
       String alias = entry.getKey();
       FetchOperator fetchOp = entry.getValue();
@@ -351,13 +354,6 @@ private void startForward(boolean inputFileChangeSenstive, String bigTableBucket
 
       // get the root operator
       Operator<? extends OperatorDesc> forwardOp = work.getAliasToWork().get(alias);
-      if (fetchOp.isEmptyTable()) {
-        //generate empty hashtable for empty table
-        this.generateDummyHashTable(alias, bigTableBucket);
-        forwardOp.close(false);
-        continue;
-      }
-
       // walk through the operator tree
       while (!forwardOp.getDone()) {
         InspectableObject row = fetchOp.getNextRow();
@@ -366,11 +362,10 @@ private void startForward(boolean inputFileChangeSenstive, String bigTableBucket
         }
         forwardOp.processOp(row.o, 0);
       }
-      if (inputFileChangeSenstive) {
-        execContext.setCurrentBigBucketFile(bigTableBucket);
-        forwardOp.reset();
-      }
-      forwardOp.close(false);
-    }
+      forwardOp.flush();
+    }
+    for (Operator<?> source : work.getAliasToWork().values()) {
+      source.close(false);
+    }
   }
 
@@ -421,43 +416,6 @@ private void initializeOperators(Map<FetchOperator, JobConf> fetchOpJobConfMap)
     }
   }
 
-  private void generateDummyHashTable(String alias, String bigBucketFileName)
-      throws HiveException,IOException {
-    LOG.debug("generating dummy for " + alias);
-    // find the (byte)tag for the map join(HashTableSinkOperator)
-    Operator<? extends OperatorDesc> parentOp = work.getAliasToWork().get(alias);
-    Operator<? extends OperatorDesc> childOp = parentOp.getChildOperators().get(0);
-    while ((childOp != null) && (!(childOp instanceof HashTableSinkOperator))) {
-      parentOp = childOp;
-      assert parentOp.getChildOperators().size() == 1;
-      childOp = parentOp.getChildOperators().get(0);
-    }
-    if (childOp == null) {
-      throw new HiveException(
-          "Cannot find HashTableSink op by tracing down the table scan operator tree");
-    }
-    byte tag = (byte) childOp.getParentOperators().indexOf(parentOp);
-
-    // generate empty hashtable for this (byte)tag
-    Path tmpPath = this.getWork().getTmpPath();
-
-    String fileName = work.getBucketFileName(bigBucketFileName);
-
-    HashTableSinkOperator htso = (HashTableSinkOperator)childOp;
-    Path path = Utilities.generatePath(tmpPath, htso.getConf().getDumpFilePrefix(),
-        tag, fileName);
-    console.printInfo(Utilities.now() + "\tDump the hashtable into file: " + path);
-    FileSystem fs = path.getFileSystem(job);
-    ObjectOutputStream out = new ObjectOutputStream(fs.create(path));
-    try {
-      MapJoinTableContainerSerDe.persistDummyTable(out);
-    } finally {
-      out.close();
-    }
-    console.printInfo(Utilities.now() + "\tUpload 1 File to: " + path + " File size: "
-        + fs.getFileStatus(path).getLen());
-  }
-
   private void setUpFetchOpContext(FetchOperator fetchOp, String alias, String currentInputFile)
       throws Exception {
diff --git ql/src/test/queries/clientpositive/auto_sortmerge_join_11.q ql/src/test/queries/clientpositive/auto_sortmerge_join_11.q
index da2e26f..73e98dd 100644
--- ql/src/test/queries/clientpositive/auto_sortmerge_join_11.q
+++ ql/src/test/queries/clientpositive/auto_sortmerge_join_11.q
@@ -34,3 +34,7 @@ select count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key;
 -- The join is converted to a bucketed mapjoin with a mapjoin hint
 explain extended select /*+ mapjoin(a) */ count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key;
 select /*+ mapjoin(a) */ count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key;
+
+-- HIVE-7023
+explain extended select /* + MAPJOIN(a,b) */ count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key JOIN bucket_big c ON a.key = c.key;
+select /* + MAPJOIN(a,b) */ count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key JOIN bucket_big c ON a.key = c.key;
diff --git ql/src/test/results/clientpositive/auto_sortmerge_join_11.q.out ql/src/test/results/clientpositive/auto_sortmerge_join_11.q.out
index e502e97..97bb81f 100644
--- ql/src/test/results/clientpositive/auto_sortmerge_join_11.q.out
+++ ql/src/test/results/clientpositive/auto_sortmerge_join_11.q.out
@@ -1103,3 +1103,443 @@ POSTHOOK: Input: default@bucket_small
 POSTHOOK: Input: default@bucket_small@ds=2008-04-08
 #### A masked pattern was here ####
 38
+PREHOOK: query: -- HIVE-7023
+explain extended select /* + MAPJOIN(a,b) */ count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key JOIN bucket_big c ON a.key = c.key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- HIVE-7023
+explain extended select /* + MAPJOIN(a,b) */ count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key JOIN bucket_big c ON a.key = c.key
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+
+TOK_QUERY
+   TOK_FROM
+      TOK_JOIN
+         TOK_JOIN
+            TOK_TABREF
+               TOK_TABNAME
+                  bucket_small
+               a
+            TOK_TABREF
+               TOK_TABNAME
+                  bucket_big
+               b
+            =
+               .
+                  TOK_TABLE_OR_COL
+                     a
+                  key
+               .
+                  TOK_TABLE_OR_COL
+                     b
+                  key
+         TOK_TABREF
+            TOK_TABNAME
+               bucket_big
+            c
+         =
+            .
+               TOK_TABLE_OR_COL
+                  a
+               key
+            .
+               TOK_TABLE_OR_COL
+                  c
+               key
+   TOK_INSERT
+      TOK_DESTINATION
+         TOK_DIR
+            TOK_TMP_FILE
+      TOK_SELECT
+         TOK_HINTLIST
+            TOK_HINT
+               TOK_MAPJOIN
+               TOK_HINTARGLIST
+                  a
+                  b
+         TOK_SELEXPR
+            TOK_FUNCTIONSTAR
+               count
+
+
+STAGE DEPENDENCIES:
+  Stage-4 is a root stage
+  Stage-1 depends on stages: Stage-4
+  Stage-0 is a root stage
+
+STAGE PLANS:
+  Stage: Stage-4
+    Map Reduce Local Work
+      Alias -> Map Local Tables:
+        a
+          Fetch Operator
+            limit: -1
+            Partition Description:
+                Partition
+                  input format: org.apache.hadoop.mapred.TextInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                  partition values:
+                    ds 2008-04-08
+                  properties:
+                    COLUMN_STATS_ACCURATE true
+                    bucket_count 2
+                    bucket_field_name key
+                    columns key,value
+                    columns.comments
+                    columns.types string:string
+#### A masked pattern was here ####
+                    name default.bucket_small
+                    numFiles 2
+                    numRows 0
+                    partition_columns ds
+                    partition_columns.types string
+                    rawDataSize 0
+                    serialization.ddl struct bucket_small { string key, string value}
+                    serialization.format 1
+                    serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                    totalSize 114
+#### A masked pattern was here ####
+                  serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+                    input format: org.apache.hadoop.mapred.TextInputFormat
+                    output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                    properties:
+                      bucket_count 2
+                      bucket_field_name key
+                      columns key,value
+                      columns.comments
+                      columns.types string:string
+#### A masked pattern was here ####
+                      name default.bucket_small
+                      partition_columns ds
+                      partition_columns.types string
+                      serialization.ddl struct bucket_small { string key, string value}
+                      serialization.format 1
+                      serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+#### A masked pattern was here ####
+                    serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                    name: default.bucket_small
+                  name: default.bucket_small
+        b
+          Fetch Operator
+            limit: -1
+            Partition Description:
+                Partition
+                  input format: org.apache.hadoop.mapred.TextInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                  partition values:
+                    ds 2008-04-08
+                  properties:
+                    COLUMN_STATS_ACCURATE true
+                    bucket_count 4
+                    bucket_field_name key
+                    columns key,value
+                    columns.comments
+                    columns.types string:string
+#### A masked pattern was here ####
+                    name default.bucket_big
+                    numFiles 4
+                    numRows 0
+                    partition_columns ds
+                    partition_columns.types string
+                    rawDataSize 0
+                    serialization.ddl struct bucket_big { string key, string value}
+                    serialization.format 1
+                    serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                    totalSize 5812
+#### A masked pattern was here ####
+                  serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+                    input format: org.apache.hadoop.mapred.TextInputFormat
+                    output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                    properties:
+                      bucket_count 4
+                      bucket_field_name key
+                      columns key,value
+                      columns.comments
+                      columns.types string:string
+#### A masked pattern was here ####
+                      name default.bucket_big
+                      partition_columns ds
+                      partition_columns.types string
+                      serialization.ddl struct bucket_big { string key, string value}
+                      serialization.format 1
+                      serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+#### A masked pattern was here ####
+                    serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                    name: default.bucket_big
+                  name: default.bucket_big
+                Partition
+                  input format: org.apache.hadoop.mapred.TextInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                  partition values:
+                    ds 2008-04-09
+                  properties:
+                    COLUMN_STATS_ACCURATE true
+                    bucket_count 4
+                    bucket_field_name key
+                    columns key,value
+                    columns.comments
+                    columns.types string:string
+#### A masked pattern was here ####
+                    name default.bucket_big
+                    numFiles 4
+                    numRows 0
+                    partition_columns ds
+                    partition_columns.types string
+                    rawDataSize 0
+                    serialization.ddl struct bucket_big { string key, string value}
+                    serialization.format 1
+                    serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                    totalSize 5812
+#### A masked pattern was here ####
+                  serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+                    input format: org.apache.hadoop.mapred.TextInputFormat
+                    output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                    properties:
+                      bucket_count 4
+                      bucket_field_name key
+                      columns key,value
+                      columns.comments
+                      columns.types string:string
+#### A masked pattern was here ####
+                      name default.bucket_big
+                      partition_columns ds
+                      partition_columns.types string
+                      serialization.ddl struct bucket_big { string key, string value}
+                      serialization.format 1
+                      serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+#### A masked pattern was here ####
+                    serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                    name: default.bucket_big
+                  name: default.bucket_big
+      Alias -> Map Local Operator Tree:
+        a
+          TableScan
+            alias: a
+            Statistics: Num rows: 1 Data size: 114 Basic stats: COMPLETE Column stats: NONE
+            GatherStats: false
+            HashTable Sink Operator
+              condition expressions:
+                0
+                1
+                2
+              keys:
+                0 key (type: string)
+                1 key (type: string)
+                2 key (type: string)
+              Position of Big Table: 2
+        b
+          TableScan
+            alias: b
+            Statistics: Num rows: 116 Data size: 11624 Basic stats: COMPLETE Column stats: NONE
+            GatherStats: false
+            HashTable Sink Operator
+              condition expressions:
+                0
+                1
+                2
+              keys:
+                0 key (type: string)
+                1 key (type: string)
+                2 key (type: string)
+              Position of Big Table: 2
+      Bucket Mapjoin Context:
+          Alias Bucket Base File Name Mapping:
+            a {ds=2008-04-08/srcsortbucket1outof4.txt=[ds=2008-04-08/smallsrcsortbucket1outof4.txt], ds=2008-04-08/srcsortbucket2outof4.txt=[ds=2008-04-08/smallsrcsortbucket2outof4.txt], ds=2008-04-08/srcsortbucket3outof4.txt=[ds=2008-04-08/smallsrcsortbucket1outof4.txt], ds=2008-04-08/srcsortbucket4outof4.txt=[ds=2008-04-08/smallsrcsortbucket2outof4.txt], ds=2008-04-09/srcsortbucket1outof4.txt=[ds=2008-04-08/smallsrcsortbucket1outof4.txt], ds=2008-04-09/srcsortbucket2outof4.txt=[ds=2008-04-08/smallsrcsortbucket2outof4.txt], ds=2008-04-09/srcsortbucket3outof4.txt=[ds=2008-04-08/smallsrcsortbucket1outof4.txt], ds=2008-04-09/srcsortbucket4outof4.txt=[ds=2008-04-08/smallsrcsortbucket2outof4.txt]}
+            b {ds=2008-04-08/srcsortbucket1outof4.txt=[ds=2008-04-08/srcsortbucket1outof4.txt, ds=2008-04-09/srcsortbucket1outof4.txt], ds=2008-04-08/srcsortbucket2outof4.txt=[ds=2008-04-08/srcsortbucket2outof4.txt, ds=2008-04-09/srcsortbucket2outof4.txt], ds=2008-04-08/srcsortbucket3outof4.txt=[ds=2008-04-08/srcsortbucket3outof4.txt, ds=2008-04-09/srcsortbucket3outof4.txt], ds=2008-04-08/srcsortbucket4outof4.txt=[ds=2008-04-08/srcsortbucket4outof4.txt, ds=2008-04-09/srcsortbucket4outof4.txt], ds=2008-04-09/srcsortbucket1outof4.txt=[ds=2008-04-08/srcsortbucket1outof4.txt, ds=2008-04-09/srcsortbucket1outof4.txt], ds=2008-04-09/srcsortbucket2outof4.txt=[ds=2008-04-08/srcsortbucket2outof4.txt, ds=2008-04-09/srcsortbucket2outof4.txt], ds=2008-04-09/srcsortbucket3outof4.txt=[ds=2008-04-08/srcsortbucket3outof4.txt, ds=2008-04-09/srcsortbucket3outof4.txt], ds=2008-04-09/srcsortbucket4outof4.txt=[ds=2008-04-08/srcsortbucket4outof4.txt, ds=2008-04-09/srcsortbucket4outof4.txt]}
+          Alias Bucket File Name Mapping:
+#### A masked pattern was here ####
+          Alias Bucket Output File Name Mapping:
+#### A masked pattern was here ####
+
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: c
+            Statistics: Num rows: 116 Data size: 11624 Basic stats: COMPLETE Column stats: NONE
+            GatherStats: false
+            Map Join Operator
+              condition map:
+                   Inner Join 0 to 1
+                   Inner Join 0 to 2
+              condition expressions:
+                0
+                1
+                2
+              keys:
+                0 key (type: string)
+                1 key (type: string)
+                2 key (type: string)
+              Position of Big Table: 2
+              Statistics: Num rows: 255 Data size: 25572 Basic stats: COMPLETE Column stats: NONE
+              BucketMapJoin: true
+              Select Operator
+                Statistics: Num rows: 255 Data size: 25572 Basic stats: COMPLETE Column stats: NONE
+                Group By Operator
+                  aggregations: count()
+                  mode: hash
+                  outputColumnNames: _col0
+                  Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+                  Reduce Output Operator
+                    sort order:
+                    Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+                    tag: -1
+                    value expressions: _col0 (type: bigint)
+      Local Work:
+        Map Reduce Local Work
+      Path -> Alias:
+#### A masked pattern was here ####
+      Path -> Partition:
+#### A masked pattern was here ####
+          Partition
+            base file name: ds=2008-04-08
+            input format: org.apache.hadoop.mapred.TextInputFormat
+            output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+            partition values:
+              ds 2008-04-08
+            properties:
+              COLUMN_STATS_ACCURATE true
+              bucket_count 4
+              bucket_field_name key
+              columns key,value
+              columns.comments
+              columns.types string:string
+#### A masked pattern was here ####
+              name default.bucket_big
+              numFiles 4
+              numRows 0
+              partition_columns ds
+              partition_columns.types string
+              rawDataSize 0
+              serialization.ddl struct bucket_big { string key, string value}
+              serialization.format 1
+              serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              totalSize 5812
+#### A masked pattern was here ####
+            serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+              input format: org.apache.hadoop.mapred.TextInputFormat
+              output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+              properties:
+                bucket_count 4
+                bucket_field_name key
+                columns key,value
+                columns.comments
+                columns.types string:string
+#### A masked pattern was here ####
+                name default.bucket_big
+                partition_columns ds
+                partition_columns.types string
+                serialization.ddl struct bucket_big { string key, string value}
+                serialization.format 1
+                serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+#### A masked pattern was here ####
+              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              name: default.bucket_big
+            name: default.bucket_big
+#### A masked pattern was here ####
+          Partition
+            base file name: ds=2008-04-09
+            input format: org.apache.hadoop.mapred.TextInputFormat
+            output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+            partition values:
+              ds 2008-04-09
+            properties:
+              COLUMN_STATS_ACCURATE true
+              bucket_count 4
+              bucket_field_name key
+              columns key,value
+              columns.comments
+              columns.types string:string
+#### A masked pattern was here ####
+              name default.bucket_big
+              numFiles 4
+              numRows 0
+              partition_columns ds
+              partition_columns.types string
+              rawDataSize 0
+              serialization.ddl struct bucket_big { string key, string value}
+              serialization.format 1
+              serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              totalSize 5812
+#### A masked pattern was here ####
+            serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+              input format: org.apache.hadoop.mapred.TextInputFormat
+              output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+              properties:
+                bucket_count 4
+                bucket_field_name key
+                columns key,value
+                columns.comments
+                columns.types string:string
+#### A masked pattern was here ####
+                name default.bucket_big
+                partition_columns ds
+                partition_columns.types string
+                serialization.ddl struct bucket_big { string key, string value}
+                serialization.format 1
+                serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+#### A masked pattern was here ####
+              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              name: default.bucket_big
+            name: default.bucket_big
+      Truncated Path -> Alias:
+        /bucket_big/ds=2008-04-08 [c]
+        /bucket_big/ds=2008-04-09 [c]
+      Needs Tagging: false
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations: count(VALUE._col0)
+          mode: mergepartial
+          outputColumnNames: _col0
+          Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+          Select Operator
+            expressions: _col0 (type: bigint)
+            outputColumnNames: _col0
+            Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+            File Output Operator
+              compressed: false
+              GlobalTableId: 0
+#### A masked pattern was here ####
+              NumFilesPerFileSink: 1
+              Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+#### A masked pattern was here ####
+              table:
+                  input format: org.apache.hadoop.mapred.TextInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                  properties:
+                    columns _col0
+                    columns.types bigint
+                    escape.delim \
+                    hive.serialization.extend.nesting.levels true
+                    serialization.format 1
+                    serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                  serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              TotalFiles: 1
+              GatherStats: false
+              MultiFileSpray: false
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+
+PREHOOK: query: select /* + MAPJOIN(a,b) */ count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key JOIN bucket_big c ON a.key = c.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@bucket_big
+PREHOOK: Input: default@bucket_big@ds=2008-04-08
+PREHOOK: Input: default@bucket_big@ds=2008-04-09
+PREHOOK: Input: default@bucket_small
+PREHOOK: Input: default@bucket_small@ds=2008-04-08
+#### A masked pattern was here ####
+POSTHOOK: query: select /* + MAPJOIN(a,b) */ count(*) FROM bucket_small a JOIN bucket_big b ON a.key = b.key JOIN bucket_big c ON a.key = c.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@bucket_big
+POSTHOOK: Input: default@bucket_big@ds=2008-04-08
+POSTHOOK: Input: default@bucket_big@ds=2008-04-09
+POSTHOOK: Input: default@bucket_small
+POSTHOOK: Input: default@bucket_small@ds=2008-04-08
+#### A masked pattern was here ####
+180
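
Context for the MapredLocalTask change above: with a hint such as /* + MAPJOIN(a,b) */,
two small-table aliases feed the same HashTableSinkOperator, one join tag per alias.
The old startForward() closed each alias's operator tree as soon as that alias's rows
had been forwarded, which appears to tear the shared sink down before the second alias
has run; the dedicated generateDummyHashTable() path for empty small tables compensated
inside that per-alias loop. The patched sequence is: reset every source, forward and
flush each alias in turn, then close all sources once at the end, so empty tables fall
out of the ordinary close path as well. Below is a minimal, self-contained toy model of
that ordering; ToySource and ToySink are invented stand-ins for illustration only, not
Hive classes.

    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.LinkedHashMap;
    import java.util.List;
    import java.util.Map;

    public class CloseOrderingSketch {

      /** Stand-in for a sink shared by several small-table sources. */
      static class ToySink {
        private final Map<String, List<String>> hashTables = new LinkedHashMap<String, List<String>>();
        private boolean closed = false;

        void process(String alias, String row) {
          if (closed) {
            throw new IllegalStateException("row for '" + alias + "' arrived after close");
          }
          List<String> rows = hashTables.get(alias);
          if (rows == null) {
            rows = new ArrayList<String>();
            hashTables.put(alias, rows);
          }
          rows.add(row);
        }

        void close() {
          closed = true; // roughly where the hashtable files would be dumped
        }
      }

      /** Stand-in for one small-table alias's operator tree. */
      static class ToySource {
        final String alias;
        final ToySink sink;

        ToySource(String alias, ToySink sink) {
          this.alias = alias;
          this.sink = sink;
        }

        void forwardAll(List<String> rows) {
          for (String row : rows) {
            sink.process(alias, row);
          }
        }

        void close() {
          sink.close(); // closing a source cascades to its children
        }
      }

      public static void main(String[] args) {
        ToySink sharedSink = new ToySink();
        Map<ToySource, List<String>> inputs = new LinkedHashMap<ToySource, List<String>>();
        inputs.put(new ToySource("a", sharedSink), Arrays.asList("k1", "k2"));
        inputs.put(new ToySource("b", sharedSink), Arrays.asList("k1", "k3"));

        // Old shape (broken here): forwardAll() then close() per alias in one loop,
        // so alias "a" closes the shared sink before alias "b" forwards anything.
        // New shape (what the patch does): forward every alias first, close last.
        for (Map.Entry<ToySource, List<String>> e : inputs.entrySet()) {
          e.getKey().forwardAll(e.getValue()); // per-alias work (then flush() in Hive)
        }
        for (ToySource source : inputs.keySet()) {
          source.close();
        }
        System.out.println("all aliases forwarded before any close");
      }
    }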