diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/NonBlockingOpDeDupProc.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/NonBlockingOpDeDupProc.java index 4313936..6d761a9 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/NonBlockingOpDeDupProc.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/NonBlockingOpDeDupProc.java @@ -26,6 +26,7 @@ import java.util.Set; import java.util.Stack; +import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.ql.exec.FilterOperator; import org.apache.hadoop.hive.ql.exec.Operator; import org.apache.hadoop.hive.ql.exec.SelectOperator; @@ -42,6 +43,7 @@ import org.apache.hadoop.hive.ql.parse.SemanticException; import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeFieldDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeDescUtils; import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; @@ -84,7 +86,6 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, // For SEL-SEL(compute) case, move column exprs/names of child to parent. 
if (!cSEL.getConf().isSelStarNoCompute()) { - Operator terminal = ExprNodeDescUtils.getSingleParent(pSEL, null); Set funcOutputs = getFunctionOutputs( pSEL.getConf().getOutputColumnNames(), pSEL.getConf().getColList()); @@ -93,7 +94,9 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, if (!funcOutputs.isEmpty() && !checkReferences(sources, funcOutputs)) { return null; } - pSEL.getConf().setColList(ExprNodeDescUtils.backtrack(sources, pSEL, terminal)); + + pSEL.getConf().setColList(merge(pSEL.getColumnExprMap(), pSEL.getConf().getColList(), + sources, true)); pSEL.getConf().setOutputColumnNames(cSEL.getConf().getOutputColumnNames()); // updates schema only (this should be the last optimizer modifying operator tree) @@ -121,6 +124,63 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, return functionOutputs; } + private ExprNodeDesc merge(Map map, List pCols, + ExprNodeDesc cCol, boolean resolveColumn) throws SemanticException { + + if (cCol instanceof ExprNodeGenericFuncDesc) { + // all children expression should be resolved + ExprNodeGenericFuncDesc function = (ExprNodeGenericFuncDesc) cCol.clone(); + function.setChildExprs(merge(map, pCols, function.getChildren(), resolveColumn)); + return function; + } + + if (cCol instanceof ExprNodeColumnDesc) { + ExprNodeColumnDesc column = (ExprNodeColumnDesc) cCol; + String columnName = column.getColumn(); + + if (!resolveColumn) { + return column; + } + + // check if the column name is mapped + if (map != null && map.containsKey(columnName)) { + ExprNodeDesc expr = map.get(columnName); + return merge(map, pCols, expr, false); + } + + if (resolveColumn) { + // otherwise just pick the expression from the parent column list + int index = HiveConf.getPositionFromInternalName(column.getColumn()); + if (index == -1) { + throw new SemanticException(); + } + return pCols.get(index); + } else { + return column; + } + } + + if (cCol instanceof ExprNodeFieldDesc) { + // field expression should 
be resolved + ExprNodeFieldDesc field = (ExprNodeFieldDesc) cCol.clone(); + field.setDesc(merge(map, pCols, field.getDesc(), resolveColumn)); + return field; + } + // constant or null expr, just return + return cCol; + } + + private List merge(Map map, List pCols, + List cCols, boolean resolveColumn) throws SemanticException { + + List result = new ArrayList(); + + for (ExprNodeDesc cCol: cCols) { + result.add(merge(map, pCols, cCol, resolveColumn)); + } + return result; + } + // if any expression of child is referencing parent column which is result of function // twice or more, skip dedup. private boolean checkReferences(List sources, Set funcOutputs) { diff --git ql/src/test/results/clientpositive/union_remove_22.q.out ql/src/test/results/clientpositive/union_remove_22.q.out index ea76dfb..2c77827 100644 --- ql/src/test/results/clientpositive/union_remove_22.q.out +++ ql/src/test/results/clientpositive/union_remove_22.q.out @@ -109,24 +109,17 @@ STAGE PLANS: type: string expr: _col1 type: bigint - outputColumnNames: _col0, _col1 - Select Operator - expressions: - expr: _col0 - type: string - expr: _col1 - type: bigint - expr: _col1 - type: bigint - outputColumnNames: _col0, _col1, _col2 - File Output Operator - compressed: false - GlobalTableId: 1 - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.outputtbl1 + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 Stage: Stage-0 Move Operator @@ -186,24 +179,17 @@ STAGE PLANS: type: string expr: _col1 type: bigint - outputColumnNames: _col0, _col1 - 
Select Operator - expressions: - expr: _col0 - type: string - expr: _col1 - type: bigint - expr: _col1 - type: bigint - outputColumnNames: _col0, _col1, _col2 - File Output Operator - compressed: false - GlobalTableId: 1 - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.outputtbl1 + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 PREHOOK: query: insert overwrite table outputTbl1 @@ -359,35 +345,19 @@ STAGE PLANS: expressions: expr: _col0 type: string - expr: _col1 + expr: UDFToLong(concat(_col1, _col1)) type: bigint - outputColumnNames: _col0, _col1 - Select Operator - expressions: - expr: _col0 - type: string - expr: concat(_col1, _col1) - type: string - expr: concat(_col1, _col1) - type: string - outputColumnNames: _col0, _col1, _col2 - Select Operator - expressions: - expr: _col0 - type: string - expr: UDFToLong(_col1) - type: bigint - expr: UDFToLong(_col2) - type: bigint - outputColumnNames: _col0, _col1, _col2 - File Output Operator - compressed: false - GlobalTableId: 1 - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.outputtbl1 + expr: UDFToLong(concat(_col1, _col1)) + type: bigint + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat 
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 Stage: Stage-0 Move Operator @@ -445,35 +415,19 @@ STAGE PLANS: expressions: expr: _col0 type: string - expr: _col1 + expr: UDFToLong(concat(_col1, _col1)) type: bigint - outputColumnNames: _col0, _col1 - Select Operator - expressions: - expr: _col0 - type: string - expr: concat(_col1, _col1) - type: string - expr: concat(_col1, _col1) - type: string - outputColumnNames: _col0, _col1, _col2 - Select Operator - expressions: - expr: _col0 - type: string - expr: UDFToLong(_col1) - type: bigint - expr: UDFToLong(_col2) - type: bigint - outputColumnNames: _col0, _col1, _col2 - File Output Operator - compressed: false - GlobalTableId: 1 - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.outputtbl1 + expr: UDFToLong(concat(_col1, _col1)) + type: bigint + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 PREHOOK: query: insert overwrite table outputTbl1