diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
index 97b7048..26c6a22 100644
--- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
+++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
@@ -1031,6 +1031,7 @@
         "Whether to push predicates down into storage handlers. Ignored when hive.optimize.ppd is false."),
     // Constant propagation optimizer
     HIVEOPTCONSTANTPROPAGATION("hive.optimize.constant.propagation", true, "Whether to enable constant propagation optimizer"),
+    HIVEIDENTITYPROJECTREMOVER("hive.optimize.remove.identity.project", true, "Removes identity project from operator tree"),
     HIVEMETADATAONLYQUERIES("hive.optimize.metadataonly", true, ""),
     HIVENULLSCANOPTIMIZE("hive.optimize.null.scan", true, "Dont scan relations which are guaranteed to not generate any rows"),
     HIVEOPTPPD_STORAGE("hive.optimize.ppd.storage", true,
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/OperatorUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/OperatorUtils.java
index 2bd40fa..95c33b8 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/OperatorUtils.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/OperatorUtils.java
@@ -143,4 +143,19 @@
       }
     }
   }
+
+  /**
+   * Tells whether two operators expose equal row schemas.
+   *
+   * @param operator1 first operator to compare
+   * @param operator2 second operator to compare
+   * @return true only when the first operator has a schema and that schema
+   *         equals the schema of the second operator
+   */
+  public static boolean sameRowSchema(Operator<?> operator1, Operator<?> operator2) {
+    RowSchema schema1 = operator1.getSchema();
+    // A missing schema cannot be shown to match anything, so answer "different"
+    // instead of failing with a NullPointerException.
+    return schema1 != null && schema1.equals(operator2.getSchema());
+  }
 }
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/RowSchema.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/RowSchema.java
index 71cc7eb..bce5451 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/RowSchema.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/RowSchema.java
@@ -20,6 +20,7 @@
 
 import java.io.Serializable;
 import java.util.ArrayList;
+import java.util.Iterator;
 
 /**
  * RowSchema Implementation.
@@ -47,6 +48,62 @@
   public ArrayList getSignature() {
     return signature;
   }
+
+  /**
+   * Compares this schema with another object. Two schemas are equal when both have no
+   * signature, or when their signatures hold equal columns in the same order.
+   *
+   * @param obj object to compare with; may be null
+   * @return true if obj is a RowSchema with an equal signature
+   */
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj) {
+      return true;
+    }
+    // instanceof is false for null, so this also rejects a null argument
+    if (!(obj instanceof RowSchema)) {
+      return false;
+    }
+
+    RowSchema dest = (RowSchema) obj;
+    if (this.signature == null || dest.getSignature() == null) {
+      // equal only when both signatures are missing
+      return this.signature == dest.getSignature();
+    }
+    if (this.signature.size() != dest.getSignature().size()) {
+      return false;
+    }
+
+    Iterator<ColumnInfo> origIt = this.signature.iterator();
+    Iterator<ColumnInfo> destIt = dest.getSignature().iterator();
+    while (origIt.hasNext()) {
+      ColumnInfo origColumn = origIt.next();
+      ColumnInfo destColumn = destIt.next();
+      if (origColumn == null || destColumn == null) {
+        // a null column only matches another null column
+        if (origColumn != destColumn) {
+          return false;
+        }
+        continue;
+      }
+      if (!origColumn.equals(destColumn)) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  /**
+   * Returns a hash code consistent with {@link #equals(Object)}. It is derived from
+   * the column count only, so it does not depend on how ColumnInfo hashes.
+   *
+   * @return 0 when there is no signature, otherwise the number of columns
+   */
+  @Override
+  public int hashCode() {
+    return signature == null ? 0 : signature.size();
+  }
 
   @Override
   public String toString() {
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/SelectOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/SelectOperator.java
index 42b546b..0ed37b6 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/SelectOperator.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/SelectOperator.java
@@ -125,4 +125,31 @@
   public boolean acceptLimitPushdown() {
     return true;
   }
+
+  /**
+   * Checks whether this select operator does something to the
+   * input tuples.
+   *
+   * @return true if this is an identity select (select *, or same schema as its only parent)
+   */
+  public boolean isIdentitySelect() {
+    // Safety check: only a select with exactly one parent is considered
+    if (this.getNumParent() != 1) {
+      return false;
+    }
+
+    // Select *
+    if (this.getConf().isSelStarNoCompute() ||
+        this.getConf().isSelectStar()) {
+      return true;
+    }
+
+    // Check whether this operator and its parent have the same schema
+    if (!OperatorUtils.sameRowSchema(this, this.getParentOperators().get(0))) {
+      return false;
+    }
+
+    return true;
+  }
+
 }
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/IdentityProjectRemover.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/IdentityProjectRemover.java
new file mode 100644
index 0000000..6585018
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/IdentityProjectRemover.java
@@ -0,0 +1,112 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.optimizer;
+
+import java.util.ArrayList;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Stack;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hive.ql.exec.Operator;
+import org.apache.hadoop.hive.ql.exec.SelectOperator;
+import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
+import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
+import org.apache.hadoop.hive.ql.lib.GraphWalker;
+import org.apache.hadoop.hive.ql.lib.Node;
+import org.apache.hadoop.hive.ql.lib.NodeProcessor;
+import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
+import org.apache.hadoop.hive.ql.lib.Rule;
+import org.apache.hadoop.hive.ql.lib.RuleRegExp;
+import org.apache.hadoop.hive.ql.parse.ParseContext;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.plan.OperatorDesc;
+
+/**
+ * This optimization tries to remove every {@link SelectOperator} that does no processing
+ * except forwarding columns from its parent to its children.
+ * e.g., select * from (select * from src where key = value) t1 join (select * from src where key = value) t2;
+ * Query tree
+ *
+ * Without this optimization:
+ *
+ *  TS -> FIL -> SEL -> RS ->
+ *                             JOIN -> SEL -> FS
+ *  TS -> FIL -> SEL -> RS ->
+ *
+ * With this optimization
+ *
+ *  TS -> FIL -> RS ->
+ *                      JOIN -> FS
+ *  TS -> FIL -> RS ->
+ *
+ * Note the absence of a select operator after the filter and after the join operator.
+ * Also, see : identity_proj_remove.q
+ */
+public class IdentityProjectRemover implements Transform {
+
+  private static final Log LOG = LogFactory.getLog(IdentityProjectRemover.class);
+
+  /**
+   * Walks the operator tree starting at the top operators of the parse context and lets
+   * {@link ProjectRemover} process the nodes matched by the select-operator rule.
+   *
+   * @param pctx parse context that supplies the top operators
+   * @return {@code pctx}, the instance that was passed in
+   * @throws SemanticException if the graph walk reports one
+   */
+  @Override
+  public ParseContext transform(ParseContext pctx) throws SemanticException {
+    Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
+    opRules.put(new RuleRegExp("R1",
+      "(" + SelectOperator.getOperatorName() + "%)"), new ProjectRemover());
+    GraphWalker ogw = new DefaultGraphWalker(new DefaultRuleDispatcher(null, opRules, null));
+    ArrayList<Node> topNodes = new ArrayList<Node>();
+    topNodes.addAll(pctx.getTopOps().values());
+    ogw.startWalking(topNodes, null);
+    return pctx;
+  }
+
+  /** Removes an identity select with a single parent via removeChildAndAdoptItsChildren. */
+  private static class ProjectRemover implements NodeProcessor {
+
+    @Override
+    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
+        Object... nodeOutputs) throws SemanticException {
+
+      SelectOperator sel = (SelectOperator) nd;
+      List<Operator<? extends OperatorDesc>> parents = sel.getParentOperators();
+      if (parents.size() != 1) {
+        // multiple parents, can't handle that.
+        return null;
+      }
+      Operator<? extends OperatorDesc> parent = parents.get(0);
+      if (sel.isIdentitySelect()) {
+        parent.removeChildAndAdoptItsChildren(sel);
+        // build the message only when debug logging is on
+        if (LOG.isDebugEnabled()) {
+          LOG.debug("Identity project remover optimization removed : " + sel);
+        }
+      }
+      return null;
+    }
+  }
+}
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java
index 3e32558..fc1161a 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java
@@ -122,6 +122,9 @@
       transformations.add(new ReduceSinkDeDuplication());
     }
     transformations.add(new NonBlockingOpDeDupProc());
+    if(HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEIDENTITYPROJECTREMOVER)) {
+      transformations.add(new IdentityProjectRemover());
+    }
     if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVELIMITOPTENABLE)) {
       transformations.add(new GlobalLimitOptimizer());
     }
diff --git a/ql/src/test/queries/clientpositive/identity_proj_remove.q b/ql/src/test/queries/clientpositive/identity_proj_remove.q
new file mode 100644
index 0000000..5e91a7f
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/identity_proj_remove.q
@@ -0,0 +1,9 @@
+set hive.optimize.remove.identity.project=false;
+explain
+select * from (select * from src where key = value) t1 join (select * from src where key = value) t2;
+
+set hive.optimize.remove.identity.project=true;
+explain
+select * from (select * from src where key = value) t1 join (select * from src where key = value) t2;
+
+select * from (select * from src where key = value) t1 join (select * from src where key = value) t2;
diff --git a/ql/src/test/results/clientpositive/identity_proj_remove.q.out b/ql/src/test/results/clientpositive/identity_proj_remove.q.out
new file mode 100644
index 0000000..7784f15
--- /dev/null
+++ b/ql/src/test/results/clientpositive/identity_proj_remove.q.out
@@ -0,0 +1,137 @@
+Warning: Shuffle Join JOIN[8][tables = [t1, t2]] in Stage 'Stage-1:MAPRED' is a
cross product +PREHOOK: query: explain +select * from (select * from src where key = value) t1 join (select * from src where key = value) t2 +PREHOOK: type: QUERY +POSTHOOK: query: explain +select * from (select * from src where key = value) t1 join (select * from src where key = value) t2 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (key = value) (type: boolean) + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: string), _col1 (type: string) + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (key = value) (type: boolean) + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: string), _col1 (type: string) + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} + 1 {VALUE._col0} {VALUE._col1} + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 275 Data size: 2921 
Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: string), _col2 (type: string), _col3 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +Warning: Shuffle Join JOIN[8][tables = [t1, t2]] in Stage 'Stage-1:MAPRED' is a cross product +PREHOOK: query: explain +select * from (select * from src where key = value) t1 join (select * from src where key = value) t2 +PREHOOK: type: QUERY +POSTHOOK: query: explain +select * from (select * from src where key = value) t1 join (select * from src where key = value) t2 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (key = value) (type: boolean) + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: string), _col1 (type: string) + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (key = value) (type: boolean) + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + 
Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: string), _col1 (type: string) + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} + 1 {VALUE._col0} {VALUE._col1} + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +Warning: Shuffle Join JOIN[8][tables = [t1, t2]] in Stage 'Stage-1:MAPRED' is a cross product +PREHOOK: query: select * from (select * from src where key = value) t1 join (select * from src where key = value) t2 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select * from (select * from src where key = value) t1 join (select * from src where key = value) t2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here ####