commit f501be5f74da678862bffcc7dffd4a0e5ceb9a30
Author: Ashutosh Chauhan
Date:   Sun Oct 12 15:36:49 2014 -0700

    HIVE-8435 : Add identity project remover optimization

diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
index 6a1a5f0..dd2d8ee 100644
--- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
+++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
@@ -1026,6 +1026,7 @@
         "Whether to push predicates down into storage handlers. Ignored when hive.optimize.ppd is false."),
     // Constant propagation optimizer
     HIVEOPTCONSTANTPROPAGATION("hive.optimize.constant.propagation", true, "Whether to enable constant propagation optimizer"),
+    HIVEIDENTITYPROJECTREMOVER("hive.optimize.remove.identity.project", true, "Removes identity projects from the operator tree"),
     HIVEMETADATAONLYQUERIES("hive.optimize.metadataonly", true, ""),
     HIVENULLSCANOPTIMIZE("hive.optimize.null.scan", true, "Dont scan relations which are guaranteed to not generate any rows"),
     HIVEOPTPPD_STORAGE("hive.optimize.ppd.storage", true,
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/IdentityProjectRemover.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/IdentityProjectRemover.java
new file mode 100644
index 0000000..82fd9f8
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/IdentityProjectRemover.java
@@ -0,0 +1,125 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.optimizer;
+
+import java.util.ArrayList;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Stack;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hive.ql.exec.ColumnInfo;
+import org.apache.hadoop.hive.ql.exec.Operator;
+import org.apache.hadoop.hive.ql.exec.RowSchema;
+import org.apache.hadoop.hive.ql.exec.SelectOperator;
+import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
+import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
+import org.apache.hadoop.hive.ql.lib.GraphWalker;
+import org.apache.hadoop.hive.ql.lib.Node;
+import org.apache.hadoop.hive.ql.lib.NodeProcessor;
+import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
+import org.apache.hadoop.hive.ql.lib.Rule;
+import org.apache.hadoop.hive.ql.lib.RuleRegExp;
+import org.apache.hadoop.hive.ql.parse.ParseContext;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
+import org.apache.hadoop.hive.ql.plan.OperatorDesc;
+
+/**
+ * This optimization removes {@link SelectOperator}s from the tree that do no
+ * processing other than forwarding columns from their parent to their children.
+ * e.g., select * from (select * from src where key = value) t1 join (select * from src where key = value) t2;
+ *
+ * Query tree without this optimization:
+ *
+ *   TS -> FIL -> SEL -> RS ->
+ *                             JOIN -> SEL -> FS
+ *   TS -> FIL -> SEL -> RS ->
+ *
+ * With this optimization:
+ *
+ *   TS -> FIL -> RS ->
+ *                      JOIN -> FS
+ *   TS -> FIL -> RS ->
+ *
+ * Note the absence of the select operators after the filters and after the join.
+ * Also see: identity_proj_remove.q
+ */
+public class IdentityProjectRemover implements Transform {
+
+  private static final Log LOG = LogFactory.getLog(IdentityProjectRemover.class);
+
+  @Override
+  public ParseContext transform(ParseContext pctx) throws SemanticException {
+    // Walk the operator tree and apply ProjectRemover to every SelectOperator.
+    Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
+    opRules.put(new RuleRegExp("R1",
+        "(" + SelectOperator.getOperatorName() + "%)"), new ProjectRemover());
+    GraphWalker ogw = new DefaultGraphWalker(new DefaultRuleDispatcher(null, opRules, null));
+    ArrayList<Node> topNodes = new ArrayList<Node>();
+    topNodes.addAll(pctx.getTopOps().values());
+    ogw.startWalking(topNodes, null);
+    return pctx;
+  }
+
+  /** Removes a SelectOperator when it is an identity projection of its single parent. */
+  private static class ProjectRemover implements NodeProcessor {
+
+    @Override
+    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
+        Object... nodeOutputs) throws SemanticException {
+
+      SelectOperator sel = (SelectOperator) nd;
+      List<Operator<? extends OperatorDesc>> parents = sel.getParentOperators();
+      if (parents.size() != 1) {
+        // Multiple parents; can't handle that.
+        return null;
+      }
+      Operator<? extends OperatorDesc> parent = parents.get(0);
+      RowSchema prs = parent.getSchema();
+      ArrayList<ColumnInfo> pcols;
+      List<ExprNodeDesc> selCols = sel.getConf().getColList();
+      if (selCols == null || null == prs || (pcols = prs.getSignature()) == null) {
+        // This is unlikely, but let's be defensive; this is an optimization, after all.
+        return null;
+      }
+      if (selCols.size() != pcols.size()) {
+        return null;
+      }
+
+      for (int i = 0; i < pcols.size(); i++) {
+        ExprNodeDesc desc = selCols.get(i);
+        if (!(desc instanceof ExprNodeColumnDesc)) {
+          // This needs to be a simple project.
+          return null;
+        }
+        if (!(desc.getTypeInfo().equals(pcols.get(i).getType())) ||
+            !(desc.getExprString().equals(pcols.get(i).getInternalName()))) {
+          // Type and name should match.
+          return null;
+        }
+      }
+      // All looks good!
+      parent.removeChildAndAdoptItsChildren(sel);
+      LOG.debug("Identity project remover optimization removed: " + sel);
+      return null;
+    }
+  }
+}
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java
index bc4ad2f..4cd8a4a 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java
@@ -120,6 +120,9 @@ public void initialize(HiveConf hiveConf) {
       transformations.add(new ReduceSinkDeDuplication());
     }
     transformations.add(new NonBlockingOpDeDupProc());
+    if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEIDENTITYPROJECTREMOVER)) {
+      transformations.add(new IdentityProjectRemover());
+    }
     if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVELIMITOPTENABLE)) {
       transformations.add(new GlobalLimitOptimizer());
     }
diff --git a/ql/src/test/queries/clientpositive/identity_proj_remove.q b/ql/src/test/queries/clientpositive/identity_proj_remove.q
new file mode 100644
index 0000000..5e91a7f
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/identity_proj_remove.q
@@ -0,0 +1,9 @@
+set hive.optimize.remove.identity.project=false;
+explain
+select * from (select * from src where key = value) t1 join (select * from src where key = value) t2;
+
+set hive.optimize.remove.identity.project=true;
+explain
+select * from (select * from src where key = value) t1 join (select * from src where key = value) t2;
+
+select * from (select * from src where key = value) t1 join (select * from src where key = value) t2;
diff --git a/ql/src/test/results/clientpositive/identity_proj_remove.q.out b/ql/src/test/results/clientpositive/identity_proj_remove.q.out
new file mode 100644
index 0000000..7784f15
--- /dev/null
+++ b/ql/src/test/results/clientpositive/identity_proj_remove.q.out
@@ -0,0 +1,137 @@
+Warning: Shuffle Join JOIN[8][tables = [t1, t2]] in Stage 'Stage-1:MAPRED' is a cross product
+PREHOOK: query: explain
+select * from (select * from src where key = value) t1 join (select * from src where key = value) t2
+PREHOOK: type: QUERY
+POSTHOOK: query: explain
+select * from (select * from src where key = value) t1 join (select * from src where key = value) t2
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: src
+            Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+            Filter Operator
+              predicate: (key = value) (type: boolean)
+              Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
+              Select Operator
+                expressions: key (type: string), value (type: string)
+                outputColumnNames: _col0, _col1
+                Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
+                Reduce Output Operator
+                  sort order: 
+                  Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
+                  value expressions: _col0 (type: string), _col1 (type: string)
+          TableScan
+            alias: src
+            Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+            Filter Operator
+              predicate: (key = value) (type: boolean)
+              Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
+              Select Operator
+                expressions: key (type: string), value (type: string)
+                outputColumnNames: _col0, _col1
+                Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
+                Reduce Output Operator
+                  sort order: 
+                  Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
+                  value expressions: _col0 (type: string), _col1 (type: string)
+      Reduce Operator Tree:
+        Join Operator
+          condition map:
+               Inner Join 0 to 1
+          condition expressions:
+            0 {VALUE._col0} {VALUE._col1}
+            1 {VALUE._col0} {VALUE._col1}
+          outputColumnNames: _col0, _col1, _col2, _col3
+          Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE
+          Select Operator
+            expressions: _col0 (type: string), _col1 (type: string), _col2 (type: string), _col3 (type: string)
+            outputColumnNames: _col0, _col1, _col2, _col3
+            Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE
+            File Output Operator
+              compressed: false
+              Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE
+              table:
+                  input format: org.apache.hadoop.mapred.TextInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                  serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+Warning: Shuffle Join JOIN[8][tables = [t1, t2]] in Stage 'Stage-1:MAPRED' is a cross product
+PREHOOK: query: explain
+select * from (select * from src where key = value) t1 join (select * from src where key = value) t2
+PREHOOK: type: QUERY
+POSTHOOK: query: explain
+select * from (select * from src where key = value) t1 join (select * from src where key = value) t2
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: src
+            Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+            Filter Operator
+              predicate: (key = value) (type: boolean)
+              Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
+              Reduce Output Operator
+                sort order: 
+                Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
+                value expressions: _col0 (type: string), _col1 (type: string)
+          TableScan
+            alias: src
+            Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+            Filter Operator
+              predicate: (key = value) (type: boolean)
+              Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
+              Reduce Output Operator
+                sort order: 
+                Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
+                value expressions: _col0 (type: string), _col1 (type: string)
+      Reduce Operator Tree:
+        Join Operator
+          condition map:
+               Inner Join 0 to 1
+          condition expressions:
+            0 {VALUE._col0} {VALUE._col1}
+            1 {VALUE._col0} {VALUE._col1}
+          outputColumnNames: _col0, _col1, _col2, _col3
+          Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE
+          File Output Operator
+            compressed: false
+            Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE
+            table:
+                input format: org.apache.hadoop.mapred.TextInputFormat
+                output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+Warning: Shuffle Join JOIN[8][tables = [t1, t2]] in Stage 'Stage-1:MAPRED' is a cross product
+PREHOOK: query: select * from (select * from src where key = value) t1 join (select * from src where key = value) t2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+#### A masked pattern was here ####
+POSTHOOK: query: select * from (select * from src where key = value) t1 join (select * from src where key = value) t2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+#### A masked pattern was here ####
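
For reference, the identity test that ProjectRemover applies can be read in isolation from Hive's
operator machinery. Below is a minimal, self-contained Java sketch of that test -- an illustration
only, not Hive API: ColumnRef is a hypothetical stand-in for the ExprNodeColumnDesc/ColumnInfo pair,
and modeling the select list as bare column references corresponds to the instanceof
ExprNodeColumnDesc check above. A projection is an identity (and its SelectOperator removable) only
when the column counts match and each position forwards the parent's column at that position with an
identical type and internal name.

// Minimal, self-contained sketch of the identity-projection test; ColumnRef is a
// hypothetical stand-in for Hive's ExprNodeColumnDesc/ColumnInfo pair, not Hive API.
import java.util.List;

public class IdentityCheckSketch {

  /** A bare column reference: an internal name plus a type name. */
  static final class ColumnRef {
    final String name;
    final String type;
    ColumnRef(String name, String type) {
      this.name = name;
      this.type = type;
    }
  }

  /**
   * True when the select list merely forwards the parent schema: the column
   * counts match, and position i selects the parent's column i with an equal
   * type and internal name. These mirror the conditions under which
   * IdentityProjectRemover drops a SelectOperator.
   */
  static boolean isIdentityProjection(List<ColumnRef> selectList, List<ColumnRef> parentSchema) {
    if (selectList == null || parentSchema == null
        || selectList.size() != parentSchema.size()) {
      return false;
    }
    for (int i = 0; i < selectList.size(); i++) {
      ColumnRef sel = selectList.get(i);
      ColumnRef par = parentSchema.get(i);
      if (!sel.type.equals(par.type) || !sel.name.equals(par.name)) {
        return false; // a reorder, rename, cast, or computed column keeps the SEL
      }
    }
    return true;
  }

  public static void main(String[] args) {
    List<ColumnRef> parent = List.of(new ColumnRef("_col0", "string"),
                                     new ColumnRef("_col1", "string"));
    List<ColumnRef> identity = List.of(new ColumnRef("_col0", "string"),
                                       new ColumnRef("_col1", "string"));
    List<ColumnRef> reordered = List.of(new ColumnRef("_col1", "string"),
                                        new ColumnRef("_col0", "string"));
    System.out.println(isIdentityProjection(identity, parent));  // true  -> removable
    System.out.println(isIdentityProjection(reordered, parent)); // false -> kept
  }
}

This is why, with hive.optimize.remove.identity.project=true, the second EXPLAIN above loses the
Select Operators between the Filter and Reduce Output Operators and the one after the Join Operator:
each merely forwarded its parent's columns unchanged.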