diff --git itests/src/test/resources/testconfiguration.properties itests/src/test/resources/testconfiguration.properties index b0b5b2e..5d07fba 100644 --- itests/src/test/resources/testconfiguration.properties +++ itests/src/test/resources/testconfiguration.properties @@ -99,6 +99,7 @@ minitez.query.files.shared=alter_merge_2_orc.q,\ groupby2.q,\ groupby3.q,\ having.q,\ + identity_project_remove_skip.q,\ insert1.q,\ insert_into1.q,\ insert_into2.q,\ diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/IdentityProjectRemover.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/IdentityProjectRemover.java index 60ef9dd..433699b 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/IdentityProjectRemover.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/IdentityProjectRemover.java @@ -24,10 +24,13 @@ import java.util.Map; import java.util.Stack; +import com.google.common.base.Predicates; +import com.google.common.collect.Iterators; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.hive.ql.exec.LateralViewForwardOperator; import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator; import org.apache.hadoop.hive.ql.exec.SelectOperator; import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker; import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher; @@ -91,6 +94,11 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, return null; } Operator parent = parents.get(0); + if (parent instanceof ReduceSinkOperator && Iterators.any(sel.getChildOperators().iterator(), + Predicates.instanceOf(ReduceSinkOperator.class))) { + // For RS-SEL-RS case. 
reducer operator in reducer task cannot be null in task compiler + return null; + } if(sel.isIdentitySelect()) { parent.removeChildAndAdoptItsChildren(sel); LOG.debug("Identity project remover optimization removed : " + sel); diff --git ql/src/test/queries/clientpositive/identity_project_remove_skip.q ql/src/test/queries/clientpositive/identity_project_remove_skip.q new file mode 100644 index 0000000..472f20a --- /dev/null +++ ql/src/test/queries/clientpositive/identity_project_remove_skip.q @@ -0,0 +1,20 @@ +set hive.optimize.remove.identity.project=true; +set hive.auto.convert.join=true; +set hive.optimize.ppd=true; + +explain +select t2.* +from + (select key,value from (select key,value from src) t1 sort by key) t2 + join + (select * from src sort by key) t3 + on (t2.key=t3.key ) + where t2.value='val_105' and t3.key='105'; + +select t2.* +from + (select key,value from (select key,value from src) t1 sort by key) t2 + join + (select * from src sort by key) t3 + on (t2.key=t3.key ) + where t2.value='val_105' and t3.key='105'; diff --git ql/src/test/results/clientpositive/identity_project_remove_skip.q.out ql/src/test/results/clientpositive/identity_project_remove_skip.q.out new file mode 100644 index 0000000..103a42e --- /dev/null +++ ql/src/test/results/clientpositive/identity_project_remove_skip.q.out @@ -0,0 +1,217 @@ +PREHOOK: query: explain +select t2.* +from + (select key,value from (select key,value from src) t1 sort by key) t2 + join + (select * from src sort by key) t3 + on (t2.key=t3.key ) + where t2.value='val_105' and t3.key='105' +PREHOOK: type: QUERY +POSTHOOK: query: explain +select t2.* +from + (select key,value from (select key,value from src) t1 sort by key) t2 + join + (select * from src sort by key) t3 + on (t2.key=t3.key ) + where t2.value='val_105' and t3.key='105' +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-6 depends on stages: Stage-1, Stage-3 , consists of Stage-7, Stage-8, Stage-2 + Stage-7 has a backup 
stage: Stage-2 + Stage-4 depends on stages: Stage-7 + Stage-8 has a backup stage: Stage-2 + Stage-5 depends on stages: Stage-8 + Stage-2 + Stage-3 is a root stage + Stage-0 depends on stages: Stage-4, Stage-5, Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: ((key is not null and (value = 'val_105')) and (key = '105')) (type: boolean) + Statistics: Num rows: 62 Data size: 658 Basic stats: COMPLETE Column stats: NONE + Select Operator + Statistics: Num rows: 62 Data size: 658 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: '105' (type: string) + sort order: + + Statistics: Num rows: 62 Data size: 658 Basic stats: COMPLETE Column stats: NONE + Reduce Operator Tree: + Select Operator + Statistics: Num rows: 62 Data size: 658 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-6 + Conditional Operator + + Stage: Stage-7 + Map Reduce Local Work + Alias -> Map Local Tables: + $INTNAME1 + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + $INTNAME1 + TableScan + HashTable Sink Operator + keys: + 0 '105' (type: string) + 1 '105' (type: string) + + Stage: Stage-4 + Map Reduce + Map Operator Tree: + TableScan + Map Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 '105' (type: string) + 1 '105' (type: string) + Select Operator + expressions: '105' (type: string), 'val_105' (type: string) + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Local Work: + Map Reduce Local Work + + Stage: Stage-8 + Map Reduce Local Work + Alias -> Map Local Tables: + $INTNAME + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + $INTNAME + TableScan + HashTable Sink Operator + keys: + 0 '105' (type: string) + 1 '105' (type: string) + + Stage: Stage-5 + Map Reduce + Map Operator Tree: + TableScan + Map Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 '105' (type: string) + 1 '105' (type: string) + Select Operator + expressions: '105' (type: string), 'val_105' (type: string) + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Local Work: + Map Reduce Local Work + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + key expressions: '105' (type: string) + sort order: + + Map-reduce partition columns: '' (type: string) + Statistics: Num rows: 62 Data size: 658 Basic stats: COMPLETE Column stats: NONE + TableScan + Reduce Output Operator + key expressions: '105' (type: string) + sort order: + + Map-reduce partition columns: '' (type: string) + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: string) + 1 _col0 (type: string) + Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: '105' (type: string), 'val_105' (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 275 Data size: 
2921 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-3 + Map Reduce + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (key = '105') (type: boolean) + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + Select Operator + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: '105' (type: string) + sort order: + + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + Reduce Operator Tree: + Select Operator + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select t2.* +from + (select key,value from (select key,value from src) t1 sort by key) t2 + join + (select * from src sort by key) t3 + on (t2.key=t3.key ) + where t2.value='val_105' and t3.key='105' +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select t2.* +from + (select key,value from (select key,value from src) t1 sort by key) t2 + join + (select * from src sort by key) t3 + on (t2.key=t3.key ) + where t2.value='val_105' and t3.key='105' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +105 val_105 diff --git 
ql/src/test/results/clientpositive/tez/identity_project_remove_skip.q.out ql/src/test/results/clientpositive/tez/identity_project_remove_skip.q.out new file mode 100644 index 0000000..8ac424d --- /dev/null +++ ql/src/test/results/clientpositive/tez/identity_project_remove_skip.q.out @@ -0,0 +1,119 @@ +PREHOOK: query: explain +select t2.* +from + (select key,value from (select key,value from src) t1 sort by key) t2 + join + (select * from src sort by key) t3 + on (t2.key=t3.key ) + where t2.value='val_105' and t3.key='105' +PREHOOK: type: QUERY +POSTHOOK: query: explain +select t2.* +from + (select key,value from (select key,value from src) t1 sort by key) t2 + join + (select * from src sort by key) t3 + on (t2.key=t3.key ) + where t2.value='val_105' and t3.key='105' +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) + Reducer 4 <- Map 3 (SIMPLE_EDGE), Reducer 2 (BROADCAST_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: ((key is not null and (value = 'val_105')) and (key = '105')) (type: boolean) + Statistics: Num rows: 62 Data size: 658 Basic stats: COMPLETE Column stats: NONE + Select Operator + Statistics: Num rows: 62 Data size: 658 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: '105' (type: string) + sort order: + + Statistics: Num rows: 62 Data size: 658 Basic stats: COMPLETE Column stats: NONE + Map 3 + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (key = '105') (type: boolean) + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + Select Operator + Statistics: Num 
rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: '105' (type: string) + sort order: + + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + Reducer 2 + Reduce Operator Tree: + Select Operator + Statistics: Num rows: 62 Data size: 658 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: '105' (type: string) + sort order: + + Map-reduce partition columns: '' (type: string) + Statistics: Num rows: 62 Data size: 658 Basic stats: COMPLETE Column stats: NONE + Reducer 4 + Reduce Operator Tree: + Select Operator + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 '105' (type: string) + 1 '105' (type: string) + input vertices: + 0 Reducer 2 + Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: '105' (type: string), 'val_105' (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select t2.* +from + (select key,value from (select key,value from src) t1 sort by key) t2 + join + (select * from src sort by key) t3 + on (t2.key=t3.key ) + where t2.value='val_105' and t3.key='105' +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select t2.* +from + (select key,value from (select key,value from src) t1 sort by key) t2 + join + (select * from 
src sort by key) t3 + on (t2.key=t3.key ) + where t2.value='val_105' and t3.key='105' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +105 val_105