diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java
index b4aeb14..53f0cb3 100644
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java
@@ -380,11 +380,9 @@ public static MapJoinOperator convertMapJoin(HiveConf conf,
     // remove old parents
     for (pos = 0; pos < newParentOps.size(); pos++) {
-      newParentOps.get(pos).removeChild(oldReduceSinkParentOps.get(pos));
-      newParentOps.get(pos).getChildOperators().add(mapJoinOp);
+      newParentOps.get(pos).replaceChild(oldReduceSinkParentOps.get(pos), mapJoinOp);
     }
-    mapJoinOp.getParentOperators().removeAll(oldReduceSinkParentOps);
     mapJoinOp.setParentOperators(newParentOps);
@@ -835,6 +833,7 @@ private int mapSideJoin(JoinOperator op, QBJoinTree joinTree) throws SemanticExc
    * @param pactx
    *          current parse context
    */
+  @Override
   public ParseContext transform(ParseContext pactx) throws SemanticException {
     pGraphContext = pactx;
     List<MapJoinOperator> listMapJoinOps = new ArrayList<MapJoinOperator>();
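
Note on the change above: folding removeChild + getChildOperators().add into a single replaceChild call keeps the map-join operator in the slot that the old ReduceSink child occupied, whereas the old code appended it at the end of the child list; the explicit removeAll on mapJoinOp's parent list also becomes unnecessary once setParentOperators overwrites that list wholesale. A minimal sketch of what a replaceChild helper of this kind looks like, assuming children are held in an ordered List as in Hive's Operator (the class below is a simplified, hypothetical stand-in, not the real org.apache.hadoop.hive.ql.exec.Operator):

import java.util.ArrayList;
import java.util.List;

// Simplified, hypothetical operator-DAG node used only to illustrate
// replaceChild semantics.
abstract class SketchOperator {
  private final List<SketchOperator> childOperators = new ArrayList<SketchOperator>();

  public List<SketchOperator> getChildOperators() {
    return childOperators;
  }

  public void removeChild(SketchOperator child) {
    childOperators.remove(child);
  }

  // Swap oldChild for newChild in place. Unlike removeChild(oldChild) followed
  // by getChildOperators().add(newChild), this keeps the replacement in the
  // position the old child occupied, so sibling ordering is preserved.
  public void replaceChild(SketchOperator oldChild, SketchOperator newChild) {
    int index = childOperators.indexOf(oldChild);
    if (index < 0) {
      throw new IllegalArgumentException("old child not found");
    }
    childOperators.set(index, newChild);
  }
}
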
diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/CommonJoinTaskDispatcher.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/CommonJoinTaskDispatcher.java
index 74ca355..0b9b973 100644
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/CommonJoinTaskDispatcher.java
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/CommonJoinTaskDispatcher.java
@@ -434,6 +434,8 @@ public static boolean cannotConvert(String bigTableAlias,
     Set<Integer> bigTableCandidates = MapJoinProcessor.getBigTableCandidates(joinDesc
         .getConds());
+    bigTableCandidates = multiInsertBigTableCheck(currWork, joinOp, bigTableCandidates);
+
     // no table could be the big table; there is no need to convert
     if (bigTableCandidates.isEmpty()) {
       return null;
     }
@@ -600,4 +602,42 @@ private JoinOperator getJoinOp(MapRedTask task) throws SemanticException {
       return null;
     }
   }
+
+  /*
+   * In the case of a multi-insert statement, the source operator will have multiple children.
+   * For example:
+   *   from src b
+   *   INSERT OVERWRITE TABLE src_4
+   *     select *
+   *     where b.key in
+   *       (select a.key from src a where b.value = a.value and a.key > '9')
+   *   INSERT OVERWRITE TABLE src_5
+   *     select *
+   *     where b.key not in
+   *       (select key from src s1 where s1.key > '2')
+   *
+   * Here the TableScan on 'src' (for alias b) has two children, one for each destination.
+   *
+   * In such cases only the source side of the join can be the big table candidate: it
+   * cannot be replaced by a HashTable, because its rows must also flow to the other
+   * children of the TableScan operator.
+   */
+  private Set<Integer> multiInsertBigTableCheck(MapWork currWork, JoinOperator joinOp,
+      Set<Integer> bigTableCandidates) {
+    Map<String, Operator<? extends OperatorDesc>> aliasToWork = currWork.getAliasToWork();
+
+    for (Map.Entry<String, Operator<? extends OperatorDesc>> workEntry : aliasToWork.entrySet()) {
+      String alias = workEntry.getKey();
+      Operator<? extends OperatorDesc> op = workEntry.getValue();
+      if (op.getChildOperators().size() > 1) {
+        int tablePosition = getPosition(currWork, joinOp, alias);
+        if (!bigTableCandidates.contains(tablePosition)) {
+          bigTableCandidates.clear();
+        } else {
+          bigTableCandidates.clear();
+          bigTableCandidates.add(tablePosition);
+        }
+      }
+    }
+    return bigTableCandidates;
+  }
 }
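
The rule that multiInsertBigTableCheck implements can be stated independently of the plan classes: as soon as any join input's scan fans out to more than one child, the candidate set collapses to that scan's position if it was already a candidate, and to the empty set otherwise. A self-contained sketch of the narrowing logic, with table positions and fan-out counts as plain maps (names here are hypothetical; the real code derives both from MapWork, Operator, and getPosition):

import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Set;

// Illustrates the candidate-narrowing rule of multiInsertBigTableCheck with
// plain data: childCounts maps a join position to the number of children of
// its TableScan.
public class MultiInsertBigTableRule {

  static Set<Integer> restrict(Set<Integer> candidates, Map<Integer, Integer> childCounts) {
    for (Map.Entry<Integer, Integer> e : childCounts.entrySet()) {
      if (e.getValue() > 1) {
        // This scan feeds several destinations, so it must stay streamed:
        // it is the only possible big table, and only if it already was one.
        boolean keep = candidates.contains(e.getKey());
        candidates.clear();
        if (keep) {
          candidates.add(e.getKey());
        }
      }
    }
    return candidates;
  }

  public static void main(String[] args) {
    Map<Integer, Integer> childCounts = new LinkedHashMap<Integer, Integer>();
    childCounts.put(0, 2); // multi-insert source: TableScan with two children
    childCounts.put(1, 1);

    Set<Integer> candidates = new HashSet<Integer>();
    candidates.add(0);
    candidates.add(1);

    System.out.println(restrict(candidates, childCounts)); // prints [0]
  }
}
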
diff --git ql/src/test/queries/clientpositive/subquery_multiinsert.q ql/src/test/queries/clientpositive/subquery_multiinsert.q
index 1f65b16..f65696c 100644
--- ql/src/test/queries/clientpositive/subquery_multiinsert.q
+++ ql/src/test/queries/clientpositive/subquery_multiinsert.q
@@ -42,4 +42,39 @@ INSERT OVERWRITE TABLE src_5
 select * from src_4
 ;
 select * from src_5
+;
+set hive.auto.convert.join=true;
+
+explain
+from src b
+INSERT OVERWRITE TABLE src_4
+  select *
+  where b.key in
+  (select a.key
+   from src a
+   where b.value = a.value and a.key > '9'
+  )
+INSERT OVERWRITE TABLE src_5
+  select *
+  where b.key not in ( select key from src s1 where s1.key > '2')
+  order by key
+;
+
+from src b
+INSERT OVERWRITE TABLE src_4
+  select *
+  where b.key in
+  (select a.key
+   from src a
+   where b.value = a.value and a.key > '9'
+  )
+INSERT OVERWRITE TABLE src_5
+  select *
+  where b.key not in ( select key from src s1 where s1.key > '2')
+  order by key
+;
+
+select * from src_4
+;
+select * from src_5
 ;
\ No newline at end of file
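
One reviewer-facing remark on the not in branch of this test: under SQL's three-valued logic, key not in (subquery) yields no rows at all if the subquery can produce a NULL, which is why the plan below starts with a stage (Stage-10) that counts rows matching (key > '2') and key is null and requires that count to be 0 before the anti-join result is usable. A small sketch of the semantics being guarded (hypothetical helper class, not Hive code):

import java.util.Arrays;
import java.util.List;

// Models SQL NOT IN under three-valued logic: returns TRUE, FALSE, or null
// (standing in for SQL UNKNOWN, which filters the row out).
public class NotInSemantics {

  static Boolean notIn(String key, List<String> subqueryValues) {
    if (key == null && !subqueryValues.isEmpty()) {
      return null; // a NULL key compared with anything is UNKNOWN
    }
    boolean sawNull = false;
    for (String v : subqueryValues) {
      if (v == null) {
        sawNull = true;       // a NULL in the subquery poisons NOT IN
      } else if (v.equals(key)) {
        return false;         // key is definitely in the set
      }
    }
    return sawNull ? null : true; // UNKNOWN if a NULL was seen, else TRUE
  }

  public static void main(String[] args) {
    System.out.println(notIn("5", Arrays.asList("2", "3")));  // true
    System.out.println(notIn("3", Arrays.asList("2", "3")));  // false
    System.out.println(notIn("5", Arrays.asList("2", null))); // null: row is filtered out
  }
}
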
diff --git ql/src/test/results/clientpositive/subquery_multiinsert.q.out ql/src/test/results/clientpositive/subquery_multiinsert.q.out
index fdafea3..83e9d77 100644
--- ql/src/test/results/clientpositive/subquery_multiinsert.q.out
+++ ql/src/test/results/clientpositive/subquery_multiinsert.q.out
@@ -482,3 +482,523 @@ POSTHOOK: Lineage: src_5.value EXPRESSION [(src)b.FieldSchema(name:value, type:s
 199	val_199
 199	val_199
 2	val_2
+PREHOOK: query: explain
+from src b
+INSERT OVERWRITE TABLE src_4
+  select *
+  where b.key in
+  (select a.key
+   from src a
+   where b.value = a.value and a.key > '9'
+  )
+INSERT OVERWRITE TABLE src_5
+  select *
+  where b.key not in ( select key from src s1 where s1.key > '2')
+  order by key
+PREHOOK: type: QUERY
+POSTHOOK: query: explain
+from src b
+INSERT OVERWRITE TABLE src_4
+  select *
+  where b.key in
+  (select a.key
+   from src a
+   where b.value = a.value and a.key > '9'
+  )
+INSERT OVERWRITE TABLE src_5
+  select *
+  where b.key not in ( select key from src s1 where s1.key > '2')
+  order by key
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: src_4.key EXPRESSION [(src)b.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: src_4.value EXPRESSION [(src)b.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: src_5.key EXPRESSION [(src)b.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: src_5.value EXPRESSION [(src)b.FieldSchema(name:value, type:string, comment:default), ]
+STAGE DEPENDENCIES:
+  Stage-10 is a root stage
+  Stage-13 depends on stages: Stage-10, Stage-14 , consists of Stage-12, Stage-4
+  Stage-12 has a backup stage: Stage-4
+  Stage-15 depends on stages: Stage-4, Stage-12
+  Stage-6 depends on stages: Stage-15
+  Stage-1 depends on stages: Stage-6
+  Stage-7 depends on stages: Stage-1
+  Stage-4
+  Stage-17 is a root stage
+  Stage-14 depends on stages: Stage-17
+  Stage-0 depends on stages: Stage-14
+  Stage-3 depends on stages: Stage-0
+
+STAGE PLANS:
+  Stage: Stage-10
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: s1
+            Statistics: Num rows: 58 Data size: 5812 Basic stats: COMPLETE Column stats: NONE
+            Filter Operator
+              predicate: ((key > '2') and key is null) (type: boolean)
+              Statistics: Num rows: 9 Data size: 901 Basic stats: COMPLETE Column stats: NONE
+              Select Operator
+                Statistics: Num rows: 9 Data size: 901 Basic stats: COMPLETE Column stats: NONE
+                Group By Operator
+                  aggregations: count()
+                  mode: hash
+                  outputColumnNames: _col0
+                  Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+                  Reduce Output Operator
+                    sort order:
+                    Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+                    value expressions: _col0 (type: bigint)
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations: count(VALUE._col0)
+          mode: mergepartial
+          outputColumnNames: _col0
+          Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+          Filter Operator
+            predicate: (_col0 = 0) (type: boolean)
+            Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE
+            Select Operator
+              expressions: _col0 (type: bigint)
+              outputColumnNames: _col0
+              Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE
+              Group By Operator
+                keys: _col0 (type: bigint)
+                mode: hash
+                outputColumnNames: _col0
+                Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE
+                File Output Operator
+                  compressed: false
+                  table:
+                      input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                      serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
+
+  Stage: Stage-13
+    Conditional Operator
+
+  Stage: Stage-12
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            Map Join Operator
+              condition map:
+                   Left Semi Join 0 to 1
+              condition expressions:
+                0 {key} {value}
+                1
+              keys:
+                0
+                1
+              outputColumnNames: _col0, _col1
+              File Output Operator
+                compressed: false
+                table:
+                    input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                    output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                    serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
+      Local Work:
+        Map Reduce Local Work
+          Alias -> Map Local Tables:
+            $INTNAME1
+              Fetch Operator
+                limit: -1
+          Alias -> Map Local Operator Tree:
+            $INTNAME1
+              TableScan
+
+  Stage: Stage-15
+    Map Reduce Local Work
+      Alias -> Map Local Tables:
+        sq_2:s1
+          Fetch Operator
+            limit: -1
+      Alias -> Map Local Operator Tree:
+        sq_2:s1
+          TableScan
+            alias: s1
+            Statistics: Num rows: 58 Data size: 5812 Basic stats: COMPLETE Column stats: NONE
+            Filter Operator
+              predicate: (key > '2') (type: boolean)
+              Statistics: Num rows: 19 Data size: 1903 Basic stats: COMPLETE Column stats: NONE
+              Select Operator
+                expressions: key (type: string)
+                outputColumnNames: _col0
+                Statistics: Num rows: 19 Data size: 1903 Basic stats: COMPLETE Column stats: NONE
+                HashTable Sink Operator
+                  condition expressions:
+                    0 {_col0} {_col1}
+                    1 {_col0}
+                  keys:
+                    0 _col0 (type: string)
+                    1 _col0 (type: string)
+
+  Stage: Stage-6
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            Map Join Operator
+              condition map:
+                   Left Outer Join0 to 1
+              condition expressions:
+                0 {_col0} {_col1}
+                1 {_col0}
+              keys:
+                0 _col0 (type: string)
+                1 _col0 (type: string)
+              outputColumnNames: _col0, _col1, _col4
+              Statistics: Num rows: 34 Data size: 7032 Basic stats: COMPLETE Column stats: NONE
+              Filter Operator
+                predicate: ((1 = 1) and _col4 is null) (type: boolean)
+                Statistics: Num rows: 8 Data size: 1654 Basic stats: COMPLETE Column stats: NONE
+                Select Operator
+                  expressions: _col0 (type: string), _col1 (type: string)
+                  outputColumnNames: _col0, _col1
+                  Statistics: Num rows: 8 Data size: 1654 Basic stats: COMPLETE Column stats: NONE
+                  Reduce Output Operator
+                    key expressions: _col0 (type: string)
+                    sort order: +
+                    Statistics: Num rows: 8 Data size: 1654 Basic stats: COMPLETE Column stats: NONE
+                    value expressions: _col0 (type: string), _col1 (type: string)
+      Local Work:
+        Map Reduce Local Work
+      Reduce Operator Tree:
+        Extract
+          Statistics: Num rows: 8 Data size: 1654 Basic stats: COMPLETE Column stats: NONE
+          File Output Operator
+            compressed: false
+            Statistics: Num rows: 8 Data size: 1654 Basic stats: COMPLETE Column stats: NONE
+            table:
+                input format: org.apache.hadoop.mapred.TextInputFormat
+                output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                name: default.src_5
+
+  Stage: Stage-1
+    Move Operator
+      tables:
+          replace: true
+          table:
+              input format: org.apache.hadoop.mapred.TextInputFormat
+              output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              name: default.src_5
+
+  Stage: Stage-7
+    Stats-Aggr Operator
+
+  Stage: Stage-4
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            Reduce Output Operator
+              sort order:
+              Statistics: Num rows: 29 Data size: 5812 Basic stats: COMPLETE Column stats: NONE
+              value expressions: key (type: string), value (type: string)
+          TableScan
+            Reduce Output Operator
+              sort order:
+              Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE
+      Reduce Operator Tree:
+        Join Operator
+          condition map:
+               Left Semi Join 0 to 1
+          condition expressions:
+            0 {VALUE._col0} {VALUE._col1}
+            1
+          outputColumnNames: _col0, _col1
+          Statistics: Num rows: 31 Data size: 6393 Basic stats: COMPLETE Column stats: NONE
+          File Output Operator
+            compressed: false
+            table:
+                input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
+
+  Stage: Stage-17
+    Map Reduce Local Work
+      Alias -> Map Local Tables:
+        sq_1:a
+          Fetch Operator
+            limit: -1
+      Alias -> Map Local Operator Tree:
+        sq_1:a
+          TableScan
+            alias: a
+            Statistics: Num rows: 29 Data size: 5812 Basic stats: COMPLETE Column stats: NONE
+            Filter Operator
+              predicate: (key > '9') (type: boolean)
+              Statistics: Num rows: 9 Data size: 1803 Basic stats: COMPLETE Column stats: NONE
+              Select Operator
+                expressions: key (type: string), value (type: string)
+                outputColumnNames: _col0, _col1
+                Statistics: Num rows: 9 Data size: 1803 Basic stats: COMPLETE Column stats: NONE
+                Group By Operator
+                  keys: _col0 (type: string), _col1 (type: string)
+                  mode: hash
+                  outputColumnNames: _col0, _col1
+                  Statistics: Num rows: 9 Data size: 1803 Basic stats: COMPLETE Column stats: NONE
+                  HashTable Sink Operator
+                    condition expressions:
+                      0 {key} {value}
+                      1
+                    keys:
+                      0 key (type: string), value (type: string)
+                      1 _col0 (type: string), _col1 (type: string)
+
+  Stage: Stage-14
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: b
+            Statistics: Num rows: 29 Data size: 5812 Basic stats: COMPLETE Column stats: NONE
+            Map Join Operator
+              condition map:
+                   Left Semi Join 0 to 1
+              condition expressions:
+                0 {key} {value}
+                1
+              keys:
+                0 key (type: string), value (type: string)
+                1 _col0 (type: string), _col1 (type: string)
+              outputColumnNames: _col0, _col1
+              Statistics: Num rows: 31 Data size: 6393 Basic stats: COMPLETE Column stats: NONE
+              Filter Operator
+                predicate: (1 = 1) (type: boolean)
+                Statistics: Num rows: 15 Data size: 3093 Basic stats: COMPLETE Column stats: NONE
+                Select Operator
+                  expressions: _col0 (type: string), _col1 (type: string)
+                  outputColumnNames: _col0, _col1
+                  Statistics: Num rows: 15 Data size: 3093 Basic stats: COMPLETE Column stats: NONE
+                  File Output Operator
+                    compressed: false
+                    Statistics: Num rows: 15 Data size: 3093 Basic stats: COMPLETE Column stats: NONE
+                    table:
+                        input format: org.apache.hadoop.mapred.TextInputFormat
+                        output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                        serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                        name: default.src_4
+              File Output Operator
+                compressed: false
+                table:
+                    input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                    output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                    serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
+      Local Work:
+        Map Reduce Local Work
+
+  Stage: Stage-0
+    Move Operator
+      tables:
+          replace: true
+          table:
+              input format: org.apache.hadoop.mapred.TextInputFormat
+              output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              name: default.src_4
+
+  Stage: Stage-3
+    Stats-Aggr Operator
+
+PREHOOK: query: from src b
+INSERT OVERWRITE TABLE src_4
+  select *
+  where b.key in
+  (select a.key
+   from src a
+   where b.value = a.value and a.key > '9'
+  )
+INSERT OVERWRITE TABLE src_5
+  select *
+  where b.key not in ( select key from src s1 where s1.key > '2')
+  order by key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@src_4
+PREHOOK: Output: default@src_5
+POSTHOOK: query: from src b
+INSERT OVERWRITE TABLE src_4
+  select *
+  where b.key in
+  (select a.key
+   from src a
+   where b.value = a.value and a.key > '9'
+  )
+INSERT OVERWRITE TABLE src_5
+  select *
+  where b.key not in ( select key from src s1 where s1.key > '2')
+  order by key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@src_4
+POSTHOOK: Output: default@src_5
+POSTHOOK: Lineage: src_4.key EXPRESSION [(src)b.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: src_4.key EXPRESSION [(src)b.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: src_4.value EXPRESSION [(src)b.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: src_4.value EXPRESSION [(src)b.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: src_5.key EXPRESSION [(src)b.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: src_5.key EXPRESSION [(src)b.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: src_5.value EXPRESSION [(src)b.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: src_5.value EXPRESSION [(src)b.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: select * from src_4
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src_4
+#### A masked pattern was here ####
+POSTHOOK: query: select * from src_4
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src_4
+#### A masked pattern was here ####
+POSTHOOK: Lineage: src_4.key EXPRESSION [(src)b.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: src_4.key EXPRESSION [(src)b.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: src_4.value EXPRESSION [(src)b.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: src_4.value EXPRESSION [(src)b.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: src_5.key EXPRESSION [(src)b.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: src_5.key EXPRESSION [(src)b.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: src_5.value EXPRESSION [(src)b.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: src_5.value EXPRESSION [(src)b.FieldSchema(name:value, type:string, comment:default), ]
+98	val_98
+92	val_92
+96	val_96
+95	val_95
+98	val_98
+90	val_90
+95	val_95
+90	val_90
+97	val_97
+90	val_90
+97	val_97
+PREHOOK: query: select * from src_5
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src_5
+#### A masked pattern was here ####
+POSTHOOK: query: select * from src_5
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src_5
+#### A masked pattern was here ####
+POSTHOOK: Lineage: src_4.key EXPRESSION [(src)b.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: src_4.key EXPRESSION [(src)b.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: src_4.value EXPRESSION [(src)b.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: src_4.value EXPRESSION [(src)b.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: src_5.key EXPRESSION [(src)b.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: src_5.key EXPRESSION [(src)b.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: src_5.value EXPRESSION [(src)b.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: src_5.value EXPRESSION [(src)b.FieldSchema(name:value, type:string, comment:default), ]
+0	val_0
+0	val_0
+0	val_0
+10	val_10
+100	val_100
+100	val_100
+103	val_103
+103	val_103
+104	val_104
+104	val_104
+105	val_105
+11	val_11
+111	val_111
+113	val_113
+113	val_113
+114	val_114
+116	val_116
+118	val_118
+118	val_118
+119	val_119
+119	val_119
+119	val_119
+12	val_12
+12	val_12
+120	val_120
+120	val_120
+125	val_125
+125	val_125
+126	val_126
+128	val_128
+128	val_128
+128	val_128
+129	val_129
+129	val_129
+131	val_131
+133	val_133
+134	val_134
+134	val_134
+136	val_136
+137	val_137
+137	val_137
+138	val_138
+138	val_138
+138	val_138
+138	val_138
+143	val_143
+145	val_145
+146	val_146
+146	val_146
+149	val_149
+149	val_149
+15	val_15
+15	val_15
+150	val_150
+152	val_152
+152	val_152
+153	val_153
+155	val_155
+156	val_156
+157	val_157
+158	val_158
+160	val_160
+162	val_162
+163	val_163
+164	val_164
+164	val_164
+165	val_165
+165	val_165
+166	val_166
+167	val_167
+167	val_167
+167	val_167
+168	val_168
+169	val_169
+169	val_169
+169	val_169
+169	val_169
+17	val_17
+170	val_170
+172	val_172
+172	val_172
+174	val_174
+174	val_174
+175	val_175
+175	val_175
+176	val_176
+176	val_176
+177	val_177
+178	val_178
+179	val_179
+179	val_179
+18	val_18
+18	val_18
+180	val_180
+181	val_181
+183	val_183
+186	val_186
+187	val_187
+187	val_187
+187	val_187
+189	val_189
+19	val_19
+190	val_190
+191	val_191
+191	val_191
+192	val_192
+193	val_193
+193	val_193
+193	val_193
+194	val_194
+195	val_195
+195	val_195
+196	val_196
+197	val_197
+197	val_197
+199	val_199
+199	val_199
+199	val_199
+2	val_2