diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java index 273691e..d2b5c05 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java @@ -1093,8 +1093,9 @@ public boolean supportSkewJoinOptimization() { @SuppressWarnings("unchecked") T descClone = (T)conf.clone(); + // also clone the colExprMap by default Operator ret = - OperatorFactory.getAndMakeChild(descClone, getSchema(), parentClones); + OperatorFactory.getAndMakeChild(descClone, getSchema(), getColumnExprMap(), parentClones); return ret; } diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/SkewJoinOptimizer.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/SkewJoinOptimizer.java index e87c41b..84fc1c1 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/SkewJoinOptimizer.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/SkewJoinOptimizer.java @@ -49,6 +49,7 @@ import org.apache.hadoop.hive.ql.lib.RuleRegExp; import org.apache.hadoop.hive.ql.metadata.Table; import org.apache.hadoop.hive.ql.parse.ParseContext; +import org.apache.hadoop.hive.ql.parse.QBJoinTree; import org.apache.hadoop.hive.ql.parse.RowResolver; import org.apache.hadoop.hive.ql.parse.SemanticException; import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc; @@ -164,12 +165,23 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, return null; } + // have to create a QBJoinTree for the cloned join operator + QBJoinTree originJoinTree = parseContext.getJoinContext().get(joinOp); + QBJoinTree newJoinTree; + try { + newJoinTree = originJoinTree.clone(); + } catch (CloneNotSupportedException e) { + LOG.debug("QBJoinTree could not be cloned: ", e); + return null; + } + JoinOperator joinOpClone; if (processSelect) { joinOpClone = (JoinOperator)(currOpClone.getParentOperators().get(0)); } else { joinOpClone = (JoinOperator)currOpClone; } + parseContext.getJoinContext().put(joinOpClone, newJoinTree); List tableScanCloneOpsForJoin = new ArrayList(); @@ -201,6 +213,7 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, } parseContext.getTopOps().put(newAlias, tso); + setUpAlias(originJoinTree, newJoinTree, tabAlias, newAlias, tso); } // Now do a union of the select operators: selectOp and selectOpClone @@ -610,6 +623,48 @@ private void insertRowResolvers( } } } + + /** + * Set alias in the cloned join tree + */ + private static void setUpAlias(QBJoinTree origin, QBJoinTree cloned, String origAlias, + String newAlias, Operator topOp) { + cloned.getAliasToOpInfo().remove(origAlias); + cloned.getAliasToOpInfo().put(newAlias, topOp); + if (origin.getLeftAlias().equals(origAlias)) { + cloned.setLeftAlias(null); + cloned.setLeftAlias(newAlias); + } + replaceAlias(origin.getLeftAliases(), cloned.getLeftAliases(), origAlias, newAlias); + replaceAlias(origin.getRightAliases(), cloned.getRightAliases(), origAlias, newAlias); + replaceAlias(origin.getBaseSrc(), cloned.getBaseSrc(), origAlias, newAlias); + replaceAlias(origin.getMapAliases(), cloned.getMapAliases(), origAlias, newAlias); + replaceAlias(origin.getStreamAliases(), cloned.getStreamAliases(), origAlias, newAlias); + } + + private static void replaceAlias(String[] origin, String[] cloned, + String alias, String newAlias) { + if (origin == null || cloned == null || origin.length != cloned.length) { + return; + } + for (int i = 0; i < origin.length; i++) { + if (origin[i].equals(alias)) { + cloned[i] = newAlias; + } + } + } + + private static void replaceAlias(List origin, List cloned, + String alias, String newAlias) { + if (origin == null || cloned == null || origin.size() != cloned.size()) { + return; + } + for (int i = 0; i < origin.size(); i++) { + if (origin.get(i).equals(alias)) { + cloned.set(i, newAlias); + } + } + } } /* (non-Javadoc) diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/JoinCond.java ql/src/java/org/apache/hadoop/hive/ql/parse/JoinCond.java index 96df830..bb8526f 100644 --- ql/src/java/org/apache/hadoop/hive/ql/parse/JoinCond.java +++ ql/src/java/org/apache/hadoop/hive/ql/parse/JoinCond.java @@ -79,4 +79,7 @@ public void setJoinType(final JoinType joinType) { this.joinType = joinType; } + public void setPreserved(boolean preserved) { + this.preserved = preserved; + } } diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/QBJoinTree.java ql/src/java/org/apache/hadoop/hive/ql/parse/QBJoinTree.java index 94c563f..a3e95ce 100644 --- ql/src/java/org/apache/hadoop/hive/ql/parse/QBJoinTree.java +++ ql/src/java/org/apache/hadoop/hive/ql/parse/QBJoinTree.java @@ -32,7 +32,7 @@ * Internal representation of the join tree. * */ -public class QBJoinTree implements Serializable{ +public class QBJoinTree implements Serializable, Cloneable { private static final long serialVersionUID = 1L; private String leftAlias; private String[] rightAliases; @@ -363,4 +363,70 @@ public void addPostJoinFilter(ASTNode filter) { public List getPostJoinFilters() { return postJoinFilters; } + + @Override + public QBJoinTree clone() throws CloneNotSupportedException { + QBJoinTree cloned = new QBJoinTree(); + + // shallow copy aliasToOpInfo, we won't want to clone the operator tree here + cloned.setAliasToOpInfo(aliasToOpInfo == null ? null : + new HashMap>(aliasToOpInfo)); + + cloned.setBaseSrc(baseSrc == null ? null : baseSrc.clone()); + + // shallow copy ASTNode + cloned.setExpressions(expressions); + cloned.setFilters(filters); + cloned.setFiltersForPushing(filtersForPushing); + + // clone filterMap + int[][] clonedFilterMap = filterMap == null ? null : new int[filterMap.length][]; + if (filterMap != null) { + for (int i = 0; i < filterMap.length; i++) { + clonedFilterMap[i] = filterMap[i] == null ? null : filterMap[i].clone(); + } + } + cloned.setFilterMap(clonedFilterMap); + + cloned.setId(id); + + // clone joinCond + JoinCond[] clonedJoinCond = joinCond == null ? null : new JoinCond[joinCond.length]; + if (joinCond != null) { + for (int i = 0; i < joinCond.length; i++) { + if(joinCond[i] == null) { + continue; + } + JoinCond clonedCond = new JoinCond(); + clonedCond.setJoinType(joinCond[i].getJoinType()); + clonedCond.setLeft(joinCond[i].getLeft()); + clonedCond.setPreserved(joinCond[i].getPreserved()); + clonedCond.setRight(joinCond[i].getRight()); + clonedJoinCond[i] = clonedCond; + } + } + cloned.setJoinCond(clonedJoinCond); + + cloned.setJoinSrc(joinSrc == null ? null : joinSrc.clone()); + cloned.setLeftAlias(leftAlias); + cloned.setLeftAliases(leftAliases == null ? null : leftAliases.clone()); + cloned.setMapAliases(mapAliases == null ? null : new ArrayList(mapAliases)); + cloned.setMapSideJoin(mapSideJoin); + cloned.setNoOuterJoin(noOuterJoin); + cloned.setNoSemiJoin(noSemiJoin); + cloned.setNullSafes(nullsafes == null ? null : new ArrayList(nullsafes)); + cloned.setRightAliases(rightAliases == null ? null : rightAliases.clone()); + cloned.setStreamAliases(streamAliases == null ? null : new ArrayList(streamAliases)); + + // clone postJoinFilters + for (ASTNode filter : postJoinFilters) { + cloned.getPostJoinFilters().add(filter); + } + // clone rhsSemijoin + for (Entry> entry : rhsSemijoin.entrySet()) { + cloned.addRHSSemijoinColumns(entry.getKey(), entry.getValue()); + } + + return cloned; + } } diff --git ql/src/test/queries/clientpositive/skewjoin_mapjoin1.q ql/src/test/queries/clientpositive/skewjoin_mapjoin1.q new file mode 100644 index 0000000..3986060 --- /dev/null +++ ql/src/test/queries/clientpositive/skewjoin_mapjoin1.q @@ -0,0 +1,44 @@ +set hive.mapred.supports.subdirectories=true; +set hive.optimize.skewjoin.compiletime = true; +set hive.auto.convert.join=true; + +CREATE TABLE T1(key STRING, val STRING) +SKEWED BY (key) ON ((2)) STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../../data/files/T1.txt' INTO TABLE T1; + +CREATE TABLE T2(key STRING, val STRING) +SKEWED BY (key) ON ((3)) STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../../data/files/T2.txt' INTO TABLE T2; + +-- copy from skewjoinopt1 +-- test compile time skew join and auto map join +-- a simple join query with skew on both the tables on the join key +-- adding an order by at the end to make the results deterministic + +EXPLAIN +SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key; + +SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key +ORDER BY a.key, b.key, a.val, b.val; + +-- test outer joins also + +EXPLAIN +SELECT a.*, b.* FROM T1 a RIGHT OUTER JOIN T2 b ON a.key = b.key; + +SELECT a.*, b.* FROM T1 a RIGHT OUTER JOIN T2 b ON a.key = b.key +ORDER BY a.key, b.key, a.val, b.val; + +-- an aggregation at the end should not change anything + +EXPLAIN +SELECT count(1) FROM T1 a JOIN T2 b ON a.key = b.key; + +SELECT count(1) FROM T1 a JOIN T2 b ON a.key = b.key; + +EXPLAIN +SELECT count(1) FROM T1 a RIGHT OUTER JOIN T2 b ON a.key = b.key; + +SELECT count(1) FROM T1 a RIGHT OUTER JOIN T2 b ON a.key = b.key; diff --git ql/src/test/queries/clientpositive/skewjoin_mapjoin10.q ql/src/test/queries/clientpositive/skewjoin_mapjoin10.q new file mode 100644 index 0000000..69070d4 --- /dev/null +++ ql/src/test/queries/clientpositive/skewjoin_mapjoin10.q @@ -0,0 +1,53 @@ +set hive.mapred.supports.subdirectories=true; +set hive.optimize.skewjoin.compiletime = true; +set hive.auto.convert.join=true; + +CREATE TABLE tmpT1(key STRING, val STRING) STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../../data/files/T1.txt' INTO TABLE tmpT1; + +-- testing skew on other data types - int +CREATE TABLE T1(key INT, val STRING) SKEWED BY (key) ON ((2)); +INSERT OVERWRITE TABLE T1 SELECT key, val FROM tmpT1; + +CREATE TABLE tmpT2(key STRING, val STRING) STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../../data/files/T2.txt' INTO TABLE tmpT2; + +CREATE TABLE T2(key INT, val STRING) SKEWED BY (key) ON ((3)); + +INSERT OVERWRITE TABLE T2 SELECT key, val FROM tmpT2; + +-- copy from skewjoinopt15 +-- test compile time skew join and auto map join +-- The skewed key is a integer column. +-- Otherwise this test is similar to skewjoinopt1.q +-- Both the joined tables are skewed, and the joined column +-- is an integer +-- adding a order by at the end to make the results deterministic + +EXPLAIN +SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key; + +SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key +ORDER BY a.key, b.key, a.val, b.val; + +-- test outer joins also + +EXPLAIN +SELECT a.*, b.* FROM T1 a RIGHT OUTER JOIN T2 b ON a.key = b.key; + +SELECT a.*, b.* FROM T1 a RIGHT OUTER JOIN T2 b ON a.key = b.key +ORDER BY a.key, b.key, a.val, b.val; + +-- an aggregation at the end should not change anything + +EXPLAIN +SELECT count(1) FROM T1 a JOIN T2 b ON a.key = b.key; + +SELECT count(1) FROM T1 a JOIN T2 b ON a.key = b.key; + +EXPLAIN +SELECT count(1) FROM T1 a RIGHT OUTER JOIN T2 b ON a.key = b.key; + +SELECT count(1) FROM T1 a RIGHT OUTER JOIN T2 b ON a.key = b.key; diff --git ql/src/test/queries/clientpositive/skewjoin_mapjoin11.q ql/src/test/queries/clientpositive/skewjoin_mapjoin11.q new file mode 100644 index 0000000..d907f07 --- /dev/null +++ ql/src/test/queries/clientpositive/skewjoin_mapjoin11.q @@ -0,0 +1,26 @@ +set hive.mapred.supports.subdirectories=true; +set hive.optimize.skewjoin.compiletime = true; +set hive.auto.convert.join=true; + +CREATE TABLE T1(key STRING, val STRING) +CLUSTERED BY (key) INTO 4 BUCKETS +SKEWED BY (key) ON ((2)) STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../../data/files/T1.txt' INTO TABLE T1; + +CREATE TABLE T2(key STRING, val STRING) STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../../data/files/T2.txt' INTO TABLE T2; + +-- copy from skewjoinopt19 +-- test compile time skew join and auto map join +-- add a test where the skewed key is also the bucketized key +-- it should not matter, and the compile time skewed join +-- optimization is performed +-- adding a order by at the end to make the results deterministic + +EXPLAIN +SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key; + +SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key +ORDER BY a.key, b.key, a.val, b.val; diff --git ql/src/test/queries/clientpositive/skewjoin_mapjoin2.q ql/src/test/queries/clientpositive/skewjoin_mapjoin2.q new file mode 100644 index 0000000..8cb774c --- /dev/null +++ ql/src/test/queries/clientpositive/skewjoin_mapjoin2.q @@ -0,0 +1,34 @@ +set hive.mapred.supports.subdirectories=true; +set hive.optimize.skewjoin.compiletime = true; +set hive.auto.convert.join=true; + +CREATE TABLE T1(key STRING, val STRING) +SKEWED BY (key) ON ((2), (8)) STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../../data/files/T1.txt' INTO TABLE T1; + +CREATE TABLE T2(key STRING, val STRING) +SKEWED BY (key) ON ((3), (8)) STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../../data/files/T2.txt' INTO TABLE T2; + +-- copy from skewjoinopt3 +-- test compile time skew join and auto map join +-- a simple query with skew on both the tables. One of the skewed +-- value is common to both the tables. The skewed value should not be +-- repeated in the filter. +-- adding a order by at the end to make the results deterministic + +EXPLAIN +SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key; + +SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key +ORDER BY a.key, b.key, a.val, b.val; + +-- test outer joins also + +EXPLAIN +SELECT a.*, b.* FROM T1 a FULL OUTER JOIN T2 b ON a.key = b.key; + +SELECT a.*, b.* FROM T1 a FULL OUTER JOIN T2 b ON a.key = b.key +ORDER BY a.key, b.key, a.val, b.val; diff --git ql/src/test/queries/clientpositive/skewjoin_mapjoin3.q ql/src/test/queries/clientpositive/skewjoin_mapjoin3.q new file mode 100644 index 0000000..be97bce --- /dev/null +++ ql/src/test/queries/clientpositive/skewjoin_mapjoin3.q @@ -0,0 +1,26 @@ +set hive.mapred.supports.subdirectories=true; +set hive.optimize.skewjoin.compiletime = true; +set hive.auto.convert.join=true; + +CREATE TABLE T1(key STRING, val STRING) +SKEWED BY (key, val) ON ((2, 12), (8, 18)) STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../../data/files/T1.txt' INTO TABLE T1; + +CREATE TABLE T2(key STRING, val STRING) +SKEWED BY (key, val) ON ((3, 13), (8, 18)) STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../../data/files/T2.txt' INTO TABLE T2; + +-- copy from skewjoinopt6 +-- test compile time skew join and auto map join +-- Both the join tables are skewed by 2 keys, and one of the skewed values +-- is common to both the tables. The join key is a subset of the skewed key set: +-- it only contains the first skewed key for both the tables +-- adding a order by at the end to make the results deterministic + +EXPLAIN +SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key; + +SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key +ORDER BY a.key, b.key, a.val, b.val; diff --git ql/src/test/queries/clientpositive/skewjoin_mapjoin4.q ql/src/test/queries/clientpositive/skewjoin_mapjoin4.q new file mode 100644 index 0000000..bad79cc --- /dev/null +++ ql/src/test/queries/clientpositive/skewjoin_mapjoin4.q @@ -0,0 +1,30 @@ +set hive.mapred.supports.subdirectories=true; +set hive.optimize.skewjoin.compiletime = true; +set hive.auto.convert.join=true; + +CREATE TABLE T1(key STRING, val STRING) +SKEWED BY (key) ON ((2), (8)) STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../../data/files/T1.txt' INTO TABLE T1; + +CREATE TABLE T2(key STRING, val STRING) +SKEWED BY (key) ON ((3), (8)) STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../../data/files/T2.txt' INTO TABLE T2; + +CREATE TABLE T3(key STRING, val STRING) STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../../data/files/T3.txt' INTO TABLE T3; + +-- copy from skewjoinopt7 +-- test compile time skew join and auto map join +-- This test is for validating skewed join compile time optimization for more than +-- 2 tables. The join key is the same, and so a 3-way join would be performed. +-- 2 of the 3 tables are skewed on the join key +-- adding a order by at the end to make the results deterministic + +EXPLAIN +SELECT a.*, b.*, c.* FROM T1 a JOIN T2 b ON a.key = b.key JOIN T3 c on a.key = c.key; + +SELECT a.*, b.*, c.* FROM T1 a JOIN T2 b ON a.key = b.key JOIN T3 c on a.key = c.key +ORDER BY a.key, b.key, c.key, a.val, b.val, c.val; diff --git ql/src/test/queries/clientpositive/skewjoin_mapjoin5.q ql/src/test/queries/clientpositive/skewjoin_mapjoin5.q new file mode 100644 index 0000000..f7b6023 --- /dev/null +++ ql/src/test/queries/clientpositive/skewjoin_mapjoin5.q @@ -0,0 +1,51 @@ +set hive.mapred.supports.subdirectories=true; +set hive.optimize.skewjoin.compiletime = true; +set hive.auto.convert.join=true; + +CREATE TABLE T1(key STRING, val STRING) +SKEWED BY (key) ON ((2)) STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../../data/files/T1.txt' INTO TABLE T1; + +CREATE TABLE T2(key STRING, val STRING) STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../../data/files/T2.txt' INTO TABLE T2; + +-- copy from skewjoinopt9 +-- test compile time skew join and auto map join +-- no skew join compile time optimization would be performed if one of the +-- join sources is a sub-query consisting of a union all +-- adding a order by at the end to make the results deterministic +EXPLAIN +select * from +( +select key, val from T1 + union all +select key, val from T1 +) subq1 +join T2 b on subq1.key = b.key; + +select * from +( +select key, val from T1 + union all +select key, val from T1 +) subq1 +join T2 b on subq1.key = b.key +ORDER BY subq1.key, b.key, subq1.val, b.val; + +-- no skew join compile time optimization would be performed if one of the +-- join sources is a sub-query consisting of a group by +EXPLAIN +select * from +( +select key, count(1) as cnt from T1 group by key +) subq1 +join T2 b on subq1.key = b.key; + +select * from +( +select key, count(1) as cnt from T1 group by key +) subq1 +join T2 b on subq1.key = b.key +ORDER BY subq1.key, b.key, subq1.cnt, b.val; diff --git ql/src/test/queries/clientpositive/skewjoin_mapjoin6.q ql/src/test/queries/clientpositive/skewjoin_mapjoin6.q new file mode 100644 index 0000000..a1134fe --- /dev/null +++ ql/src/test/queries/clientpositive/skewjoin_mapjoin6.q @@ -0,0 +1,22 @@ +set hive.mapred.supports.subdirectories=true; +set hive.optimize.skewjoin.compiletime = true; +set hive.auto.convert.join=true; + +CREATE TABLE T1(key STRING, value STRING) STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../../data/files/T1.txt' INTO TABLE T1; + +drop table array_valued_T1; +create table array_valued_T1 (key string, value array) SKEWED BY (key) ON ((8)); +insert overwrite table array_valued_T1 select key, array(value) from T1; + +-- copy from skewjoinopt10 +-- test compile time skew join and auto map join +-- This test is to verify the skew join compile optimization when the join is followed by a lateral view +-- adding a order by at the end to make the results deterministic + +explain +select * from (select a.key as key, b.value as array_val from T1 a join array_valued_T1 b on a.key=b.key) i lateral view explode (array_val) c as val; + +select * from (select a.key as key, b.value as array_val from T1 a join array_valued_T1 b on a.key=b.key) i lateral view explode (array_val) c as val +ORDER BY key, val; diff --git ql/src/test/queries/clientpositive/skewjoin_mapjoin7.q ql/src/test/queries/clientpositive/skewjoin_mapjoin7.q new file mode 100644 index 0000000..5b33b07 --- /dev/null +++ ql/src/test/queries/clientpositive/skewjoin_mapjoin7.q @@ -0,0 +1,35 @@ +set hive.mapred.supports.subdirectories=true; +set hive.optimize.skewjoin.compiletime = true; +set hive.auto.convert.join=true; + +CREATE TABLE T1(key STRING, val STRING) +SKEWED BY (key) ON ((2)) STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../../data/files/T1.txt' INTO TABLE T1; + +CREATE TABLE T2(key STRING, val STRING) STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../../data/files/T2.txt' INTO TABLE T2; + +-- copy from skewjoinopt11 +-- test compile time skew join and auto map join +-- This test is to verify the skew join compile optimization when the join is followed +-- by a union. Both sides of a union consist of a join, which should have used +-- skew join compile time optimization. +-- adding an order by at the end to make the results deterministic + +EXPLAIN +select * from +( + select a.key, a.val as val1, b.val as val2 from T1 a join T2 b on a.key = b.key + union all + select a.key, a.val as val1, b.val as val2 from T1 a join T2 b on a.key = b.key +) subq1; + +select * from +( + select a.key, a.val as val1, b.val as val2 from T1 a join T2 b on a.key = b.key + union all + select a.key, a.val as val1, b.val as val2 from T1 a join T2 b on a.key = b.key +) subq1 +ORDER BY key, val1, val2; diff --git ql/src/test/queries/clientpositive/skewjoin_mapjoin8.q ql/src/test/queries/clientpositive/skewjoin_mapjoin8.q new file mode 100644 index 0000000..fc4f9ca --- /dev/null +++ ql/src/test/queries/clientpositive/skewjoin_mapjoin8.q @@ -0,0 +1,38 @@ +set hive.mapred.supports.subdirectories=true; +set hive.optimize.skewjoin.compiletime = true; +set hive.auto.convert.join=true; + +CREATE TABLE T1(key STRING, val STRING) STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../../data/files/T1.txt' INTO TABLE T1; + +CREATE TABLE T2(key STRING, val STRING) STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../../data/files/T2.txt' INTO TABLE T2; + +CREATE TABLE T3(key STRING, val STRING) +SKEWED BY (val) ON ((12)) STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../../data/files/T3.txt' INTO TABLE T3; + +-- copy from skewjoinopt13 +-- test compile time skew join and auto map join +-- This test is for skewed join compile time optimization for more than 2 tables. +-- The join key for table 3 is different from the join key used for joining +-- tables 1 and 2. Table 3 is skewed, but since one of the join sources for table +-- 3 consist of a sub-query which contains a join, the compile time skew join +-- optimization is not performed +-- adding a order by at the end to make the results deterministic + +EXPLAIN +select * +from +T1 a join T2 b on a.key = b.key +join T3 c on a.val = c.val; + +select * +from +T1 a join T2 b on a.key = b.key +join T3 c on a.val = c.val +order by a.key, b.key, c.key, a.val, b.val, c.val; + diff --git ql/src/test/queries/clientpositive/skewjoin_mapjoin9.q ql/src/test/queries/clientpositive/skewjoin_mapjoin9.q new file mode 100644 index 0000000..3b76c22 --- /dev/null +++ ql/src/test/queries/clientpositive/skewjoin_mapjoin9.q @@ -0,0 +1,40 @@ +set hive.mapred.supports.subdirectories=true; +set hive.optimize.skewjoin.compiletime = true; +set hive.auto.convert.join=true; + +CREATE TABLE T1(key STRING, val STRING) +SKEWED BY (key) ON ((2)) STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../../data/files/T1.txt' INTO TABLE T1; + +CREATE TABLE T2(key STRING, val STRING) STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../../data/files/T2.txt' INTO TABLE T2; + +CREATE TABLE T3(key STRING, val STRING) +SKEWED BY (val) ON ((12)) STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../../data/files/T3.txt' INTO TABLE T3; + +-- copy from skewjoinopt14 +-- test compile time skew join and auto map join +-- This test is for skewed join compile time optimization for more than 2 tables. +-- The join key for table 3 is different from the join key used for joining +-- tables 1 and 2. Tables 1 and 3 are skewed. Since one of the join sources for table +-- 3 consist of a sub-query which contains a join, the compile time skew join +-- optimization is not enabled for table 3, but it is used for the first join between +-- tables 1 and 2 +-- adding a order by at the end to make the results deterministic + +EXPLAIN +select * +from +T1 a join T2 b on a.key = b.key +join T3 c on a.val = c.val; + +select * +from +T1 a join T2 b on a.key = b.key +join T3 c on a.val = c.val +order by a.key, b.key, a.val, b.val; + diff --git ql/src/test/results/clientpositive/skewjoin_mapjoin1.q.out ql/src/test/results/clientpositive/skewjoin_mapjoin1.q.out new file mode 100644 index 0000000..b70d2a8 --- /dev/null +++ ql/src/test/results/clientpositive/skewjoin_mapjoin1.q.out @@ -0,0 +1,660 @@ +PREHOOK: query: CREATE TABLE T1(key STRING, val STRING) +SKEWED BY (key) ON ((2)) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@T1 +POSTHOOK: query: CREATE TABLE T1(key STRING, val STRING) +SKEWED BY (key) ON ((2)) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@T1 +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/T1.txt' INTO TABLE T1 +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@t1 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/T1.txt' INTO TABLE T1 +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@t1 +PREHOOK: query: CREATE TABLE T2(key STRING, val STRING) +SKEWED BY (key) ON ((3)) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@T2 +POSTHOOK: query: CREATE TABLE T2(key STRING, val STRING) +SKEWED BY (key) ON ((3)) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@T2 +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/T2.txt' INTO TABLE T2 +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@t2 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/T2.txt' INTO TABLE T2 +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@t2 +PREHOOK: query: -- copy from skewjoinopt1 +-- test compile time skew join and auto map join +-- a simple join query with skew on both the tables on the join key +-- adding an order by at the end to make the results deterministic + +EXPLAIN +SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: -- copy from skewjoinopt1 +-- test compile time skew join and auto map join +-- a simple join query with skew on both the tables on the join key +-- adding an order by at the end to make the results deterministic + +EXPLAIN +SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-8 is a root stage + Stage-2 depends on stages: Stage-8 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-8 + Map Reduce Local Work + Alias -> Map Local Tables: + a + Fetch Operator + limit: -1 + subquery1:a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + a + TableScan + alias: a + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: (key is not null and ((key = '2') or (key = '3'))) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + HashTable Sink Operator + condition expressions: + 0 {val} + 1 {key} {val} + keys: + 0 key (type: string) + 1 key (type: string) + subquery1:a + TableScan + alias: a + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: (key is not null and (not ((key = '2') or (key = '3')))) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + HashTable Sink Operator + condition expressions: + 0 {val} + 1 {key} {val} + keys: + 0 key (type: string) + 1 key (type: string) + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + alias: b + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: (key is not null and (not ((key = '2') or (key = '3')))) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {val} + 1 {key} {val} + keys: + 0 key (type: string) + 1 key (type: string) + outputColumnNames: _col0, _col1, _col5, _col6 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Union + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + SELECT * : (no compute) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + TableScan + alias: b + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: (key is not null and ((key = '2') or (key = '3'))) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {val} + 1 {key} {val} + keys: + 0 key (type: string) + 1 key (type: string) + outputColumnNames: _col0, _col1, _col5, _col6 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Union + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + SELECT * : (no compute) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Local Work: + Map Reduce Local Work + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key +ORDER BY a.key, b.key, a.val, b.val +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key +ORDER BY a.key, b.key, a.val, b.val +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +#### A masked pattern was here #### +2 12 2 22 +3 13 3 13 +8 18 8 18 +8 18 8 18 +8 28 8 18 +8 28 8 18 +PREHOOK: query: -- test outer joins also + +EXPLAIN +SELECT a.*, b.* FROM T1 a RIGHT OUTER JOIN T2 b ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: -- test outer joins also + +EXPLAIN +SELECT a.*, b.* FROM T1 a RIGHT OUTER JOIN T2 b ON a.key = b.key +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-8 is a root stage + Stage-2 depends on stages: Stage-8 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-8 + Map Reduce Local Work + Alias -> Map Local Tables: + a + Fetch Operator + limit: -1 + subquery1:a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + a + TableScan + alias: a + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: ((key = '2') or (key = '3')) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + HashTable Sink Operator + condition expressions: + 0 {val} + 1 {key} {val} + keys: + 0 key (type: string) + 1 key (type: string) + subquery1:a + TableScan + alias: a + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: (not ((key = '2') or (key = '3'))) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + HashTable Sink Operator + condition expressions: + 0 {val} + 1 {key} {val} + keys: + 0 key (type: string) + 1 key (type: string) + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + alias: b + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: (not ((key = '2') or (key = '3'))) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Map Join Operator + condition map: + Right Outer Join0 to 1 + condition expressions: + 0 {key} {val} + 1 {key} {val} + keys: + 0 key (type: string) + 1 key (type: string) + outputColumnNames: _col0, _col1, _col5, _col6 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Union + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + SELECT * : (no compute) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + TableScan + alias: b + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: ((key = '2') or (key = '3')) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Map Join Operator + condition map: + Right Outer Join0 to 1 + condition expressions: + 0 {key} {val} + 1 {key} {val} + keys: + 0 key (type: string) + 1 key (type: string) + outputColumnNames: _col0, _col1, _col5, _col6 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Union + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + SELECT * : (no compute) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Local Work: + Map Reduce Local Work + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: SELECT a.*, b.* FROM T1 a RIGHT OUTER JOIN T2 b ON a.key = b.key +ORDER BY a.key, b.key, a.val, b.val +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT a.*, b.* FROM T1 a RIGHT OUTER JOIN T2 b ON a.key = b.key +ORDER BY a.key, b.key, a.val, b.val +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +#### A masked pattern was here #### +NULL NULL 4 14 +NULL NULL 5 15 +2 12 2 22 +3 13 3 13 +8 18 8 18 +8 18 8 18 +8 28 8 18 +8 28 8 18 +PREHOOK: query: -- an aggregation at the end should not change anything + +EXPLAIN +SELECT count(1) FROM T1 a JOIN T2 b ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: -- an aggregation at the end should not change anything + +EXPLAIN +SELECT count(1) FROM T1 a JOIN T2 b ON a.key = b.key +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-8 is a root stage + Stage-2 depends on stages: Stage-8 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-8 + Map Reduce Local Work + Alias -> Map Local Tables: + a + Fetch Operator + limit: -1 + subquery1:a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + a + TableScan + alias: a + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: (key is not null and ((key = '2') or (key = '3'))) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + HashTable Sink Operator + condition expressions: + 0 + 1 + keys: + 0 key (type: string) + 1 key (type: string) + subquery1:a + TableScan + alias: a + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: (key is not null and (not ((key = '2') or (key = '3')))) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + HashTable Sink Operator + condition expressions: + 0 + 1 + keys: + 0 key (type: string) + 1 key (type: string) + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + alias: b + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: (key is not null and (not ((key = '2') or (key = '3')))) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + keys: + 0 key (type: string) + 1 key (type: string) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Union + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + SELECT * : (no compute) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Group By Operator + aggregations: count(1) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint) + TableScan + alias: b + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: (key is not null and ((key = '2') or (key = '3'))) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + keys: + 0 key (type: string) + 1 key (type: string) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Union + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + SELECT * : (no compute) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Group By Operator + aggregations: count(1) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint) + Local Work: + Map Reduce Local Work + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: bigint) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: SELECT count(1) FROM T1 a JOIN T2 b ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT count(1) FROM T1 a JOIN T2 b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +#### A masked pattern was here #### +6 +PREHOOK: query: EXPLAIN +SELECT count(1) FROM T1 a RIGHT OUTER JOIN T2 b ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +SELECT count(1) FROM T1 a RIGHT OUTER JOIN T2 b ON a.key = b.key +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-8 is a root stage + Stage-2 depends on stages: Stage-8 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-8 + Map Reduce Local Work + Alias -> Map Local Tables: + a + Fetch Operator + limit: -1 + subquery1:a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + a + TableScan + alias: a + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: ((key = '2') or (key = '3')) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + HashTable Sink Operator + condition expressions: + 0 + 1 + keys: + 0 key (type: string) + 1 key (type: string) + subquery1:a + TableScan + alias: a + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: (not ((key = '2') or (key = '3'))) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + HashTable Sink Operator + condition expressions: + 0 + 1 + keys: + 0 key (type: string) + 1 key (type: string) + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + alias: b + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: (not ((key = '2') or (key = '3'))) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Map Join Operator + condition map: + Right Outer Join0 to 1 + condition expressions: + 0 + 1 + keys: + 0 key (type: string) + 1 key (type: string) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Union + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + SELECT * : (no compute) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Group By Operator + aggregations: count(1) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint) + TableScan + alias: b + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: ((key = '2') or (key = '3')) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Map Join Operator + condition map: + Right Outer Join0 to 1 + condition expressions: + 0 + 1 + keys: + 0 key (type: string) + 1 key (type: string) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Union + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + SELECT * : (no compute) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Group By Operator + aggregations: count(1) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint) + Local Work: + Map Reduce Local Work + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: bigint) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: SELECT count(1) FROM T1 a RIGHT OUTER JOIN T2 b ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT count(1) FROM T1 a RIGHT OUTER JOIN T2 b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +#### A masked pattern was here #### +8 diff --git ql/src/test/results/clientpositive/skewjoin_mapjoin10.q.out ql/src/test/results/clientpositive/skewjoin_mapjoin10.q.out new file mode 100644 index 0000000..3fc88cf --- /dev/null +++ ql/src/test/results/clientpositive/skewjoin_mapjoin10.q.out @@ -0,0 +1,700 @@ +PREHOOK: query: CREATE TABLE tmpT1(key STRING, val STRING) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@tmpT1 +POSTHOOK: query: CREATE TABLE tmpT1(key STRING, val STRING) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@tmpT1 +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/T1.txt' INTO TABLE tmpT1 +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@tmpt1 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/T1.txt' INTO TABLE tmpT1 +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@tmpt1 +PREHOOK: query: -- testing skew on other data types - int +CREATE TABLE T1(key INT, val STRING) SKEWED BY (key) ON ((2)) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@T1 +POSTHOOK: query: -- testing skew on other data types - int +CREATE TABLE T1(key INT, val STRING) SKEWED BY (key) ON ((2)) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@T1 +PREHOOK: query: INSERT OVERWRITE TABLE T1 SELECT key, val FROM tmpT1 +PREHOOK: type: QUERY +PREHOOK: Input: default@tmpt1 +PREHOOK: Output: default@t1 +POSTHOOK: query: INSERT OVERWRITE TABLE T1 SELECT key, val FROM tmpT1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tmpt1 +POSTHOOK: Output: default@t1 +POSTHOOK: Lineage: t1.key EXPRESSION [(tmpt1)tmpt1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t1.val SIMPLE [(tmpt1)tmpt1.FieldSchema(name:val, type:string, comment:null), ] +PREHOOK: query: CREATE TABLE tmpT2(key STRING, val STRING) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@tmpT2 +POSTHOOK: query: CREATE TABLE tmpT2(key STRING, val STRING) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@tmpT2 +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/T2.txt' INTO TABLE tmpT2 +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@tmpt2 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/T2.txt' INTO TABLE tmpT2 +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@tmpt2 +PREHOOK: query: CREATE TABLE T2(key INT, val STRING) SKEWED BY (key) ON ((3)) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@T2 +POSTHOOK: query: CREATE TABLE T2(key INT, val STRING) SKEWED BY (key) ON ((3)) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@T2 +PREHOOK: query: INSERT OVERWRITE TABLE T2 SELECT key, val FROM tmpT2 +PREHOOK: type: QUERY +PREHOOK: Input: default@tmpt2 +PREHOOK: Output: default@t2 +POSTHOOK: query: INSERT OVERWRITE TABLE T2 SELECT key, val FROM tmpT2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tmpt2 +POSTHOOK: Output: default@t2 +POSTHOOK: Lineage: t2.key EXPRESSION [(tmpt2)tmpt2.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: t2.val SIMPLE [(tmpt2)tmpt2.FieldSchema(name:val, type:string, comment:null), ] +PREHOOK: query: -- copy from skewjoinopt15 +-- test compile time skew join and auto map join +-- The skewed key is a integer column. +-- Otherwise this test is similar to skewjoinopt1.q +-- Both the joined tables are skewed, and the joined column +-- is an integer +-- adding a order by at the end to make the results deterministic + +EXPLAIN +SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: -- copy from skewjoinopt15 +-- test compile time skew join and auto map join +-- The skewed key is a integer column. +-- Otherwise this test is similar to skewjoinopt1.q +-- Both the joined tables are skewed, and the joined column +-- is an integer +-- adding a order by at the end to make the results deterministic + +EXPLAIN +SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-8 is a root stage + Stage-2 depends on stages: Stage-8 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-8 + Map Reduce Local Work + Alias -> Map Local Tables: + a + Fetch Operator + limit: -1 + subquery1:a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + a + TableScan + alias: a + Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (key is not null and ((key = 2) or (key = 3))) (type: boolean) + Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: NONE + HashTable Sink Operator + condition expressions: + 0 {val} + 1 {key} {val} + keys: + 0 key (type: int) + 1 key (type: int) + subquery1:a + TableScan + alias: a + Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (key is not null and (not ((key = 2) or (key = 3)))) (type: boolean) + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE + HashTable Sink Operator + condition expressions: + 0 {val} + 1 {key} {val} + keys: + 0 key (type: int) + 1 key (type: int) + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + alias: b + Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (key is not null and (not ((key = 2) or (key = 3)))) (type: boolean) + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {val} + 1 {key} {val} + keys: + 0 key (type: int) + 1 key (type: int) + outputColumnNames: _col0, _col1, _col5, _col6 + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: int), _col1 (type: string), _col5 (type: int), _col6 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE + Union + Statistics: Num rows: 3 Data size: 12 Basic stats: COMPLETE Column stats: NONE + Select Operator + SELECT * : (no compute) + Statistics: Num rows: 3 Data size: 12 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 3 Data size: 12 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + TableScan + alias: b + Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (key is not null and ((key = 2) or (key = 3))) (type: boolean) + Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {val} + 1 {key} {val} + keys: + 0 key (type: int) + 1 key (type: int) + outputColumnNames: _col0, _col1, _col5, _col6 + Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: int), _col1 (type: string), _col5 (type: int), _col6 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Union + Statistics: Num rows: 3 Data size: 12 Basic stats: COMPLETE Column stats: NONE + Select Operator + SELECT * : (no compute) + Statistics: Num rows: 3 Data size: 12 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 3 Data size: 12 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Local Work: + Map Reduce Local Work + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key +ORDER BY a.key, b.key, a.val, b.val +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key +ORDER BY a.key, b.key, a.val, b.val +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +#### A masked pattern was here #### +2 12 2 22 +3 13 3 13 +8 18 8 18 +8 18 8 18 +8 28 8 18 +8 28 8 18 +PREHOOK: query: -- test outer joins also + +EXPLAIN +SELECT a.*, b.* FROM T1 a RIGHT OUTER JOIN T2 b ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: -- test outer joins also + +EXPLAIN +SELECT a.*, b.* FROM T1 a RIGHT OUTER JOIN T2 b ON a.key = b.key +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-8 is a root stage + Stage-2 depends on stages: Stage-8 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-8 + Map Reduce Local Work + Alias -> Map Local Tables: + a + Fetch Operator + limit: -1 + subquery1:a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + a + TableScan + alias: a + Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: ((key = 2) or (key = 3)) (type: boolean) + Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: NONE + HashTable Sink Operator + condition expressions: + 0 {val} + 1 {key} {val} + keys: + 0 key (type: int) + 1 key (type: int) + subquery1:a + TableScan + alias: a + Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (not ((key = 2) or (key = 3))) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + HashTable Sink Operator + condition expressions: + 0 {val} + 1 {key} {val} + keys: + 0 key (type: int) + 1 key (type: int) + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + alias: b + Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (not ((key = 2) or (key = 3))) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Map Join Operator + condition map: + Right Outer Join0 to 1 + condition expressions: + 0 {key} {val} + 1 {key} {val} + keys: + 0 key (type: int) + 1 key (type: int) + outputColumnNames: _col0, _col1, _col5, _col6 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col0 (type: int), _col1 (type: string), _col5 (type: int), _col6 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Union + Statistics: Num rows: 6 Data size: 26 Basic stats: COMPLETE Column stats: NONE + Select Operator + SELECT * : (no compute) + Statistics: Num rows: 6 Data size: 26 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 6 Data size: 26 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + TableScan + alias: b + Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: ((key = 2) or (key = 3)) (type: boolean) + Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: NONE + Map Join Operator + condition map: + Right Outer Join0 to 1 + condition expressions: + 0 {key} {val} + 1 {key} {val} + keys: + 0 key (type: int) + 1 key (type: int) + outputColumnNames: _col0, _col1, _col5, _col6 + Statistics: Num rows: 6 Data size: 26 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: int), _col1 (type: string), _col5 (type: int), _col6 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 6 Data size: 26 Basic stats: COMPLETE Column stats: NONE + Union + Statistics: Num rows: 6 Data size: 26 Basic stats: COMPLETE Column stats: NONE + Select Operator + SELECT * : (no compute) + Statistics: Num rows: 6 Data size: 26 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 6 Data size: 26 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Local Work: + Map Reduce Local Work + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: SELECT a.*, b.* FROM T1 a RIGHT OUTER JOIN T2 b ON a.key = b.key +ORDER BY a.key, b.key, a.val, b.val +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT a.*, b.* FROM T1 a RIGHT OUTER JOIN T2 b ON a.key = b.key +ORDER BY a.key, b.key, a.val, b.val +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +#### A masked pattern was here #### +NULL NULL 4 14 +NULL NULL 5 15 +2 12 2 22 +3 13 3 13 +8 18 8 18 +8 18 8 18 +8 28 8 18 +8 28 8 18 +PREHOOK: query: -- an aggregation at the end should not change anything + +EXPLAIN +SELECT count(1) FROM T1 a JOIN T2 b ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: -- an aggregation at the end should not change anything + +EXPLAIN +SELECT count(1) FROM T1 a JOIN T2 b ON a.key = b.key +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-8 is a root stage + Stage-2 depends on stages: Stage-8 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-8 + Map Reduce Local Work + Alias -> Map Local Tables: + a + Fetch Operator + limit: -1 + subquery1:a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + a + TableScan + alias: a + Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (key is not null and ((key = 2) or (key = 3))) (type: boolean) + Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: NONE + HashTable Sink Operator + condition expressions: + 0 + 1 + keys: + 0 key (type: int) + 1 key (type: int) + subquery1:a + TableScan + alias: a + Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (key is not null and (not ((key = 2) or (key = 3)))) (type: boolean) + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE + HashTable Sink Operator + condition expressions: + 0 + 1 + keys: + 0 key (type: int) + 1 key (type: int) + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + alias: b + Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (key is not null and (not ((key = 2) or (key = 3)))) (type: boolean) + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + keys: + 0 key (type: int) + 1 key (type: int) + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE + Select Operator + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE + Union + Statistics: Num rows: 3 Data size: 12 Basic stats: COMPLETE Column stats: NONE + Select Operator + SELECT * : (no compute) + Statistics: Num rows: 3 Data size: 12 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(1) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint) + TableScan + alias: b + Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (key is not null and ((key = 2) or (key = 3))) (type: boolean) + Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + keys: + 0 key (type: int) + 1 key (type: int) + Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Select Operator + Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Union + Statistics: Num rows: 3 Data size: 12 Basic stats: COMPLETE Column stats: NONE + Select Operator + SELECT * : (no compute) + Statistics: Num rows: 3 Data size: 12 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(1) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint) + Local Work: + Map Reduce Local Work + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: bigint) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: SELECT count(1) FROM T1 a JOIN T2 b ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT count(1) FROM T1 a JOIN T2 b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +#### A masked pattern was here #### +6 +PREHOOK: query: EXPLAIN +SELECT count(1) FROM T1 a RIGHT OUTER JOIN T2 b ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +SELECT count(1) FROM T1 a RIGHT OUTER JOIN T2 b ON a.key = b.key +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-8 is a root stage + Stage-2 depends on stages: Stage-8 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-8 + Map Reduce Local Work + Alias -> Map Local Tables: + a + Fetch Operator + limit: -1 + subquery1:a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + a + TableScan + alias: a + Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: ((key = 2) or (key = 3)) (type: boolean) + Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: NONE + HashTable Sink Operator + condition expressions: + 0 + 1 + keys: + 0 key (type: int) + 1 key (type: int) + subquery1:a + TableScan + alias: a + Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (not ((key = 2) or (key = 3))) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + HashTable Sink Operator + condition expressions: + 0 + 1 + keys: + 0 key (type: int) + 1 key (type: int) + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + alias: b + Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (not ((key = 2) or (key = 3))) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Map Join Operator + condition map: + Right Outer Join0 to 1 + condition expressions: + 0 + 1 + keys: + 0 key (type: int) + 1 key (type: int) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Union + Statistics: Num rows: 6 Data size: 26 Basic stats: COMPLETE Column stats: NONE + Select Operator + SELECT * : (no compute) + Statistics: Num rows: 6 Data size: 26 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(1) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint) + TableScan + alias: b + Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: ((key = 2) or (key = 3)) (type: boolean) + Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: NONE + Map Join Operator + condition map: + Right Outer Join0 to 1 + condition expressions: + 0 + 1 + keys: + 0 key (type: int) + 1 key (type: int) + Statistics: Num rows: 6 Data size: 26 Basic stats: COMPLETE Column stats: NONE + Select Operator + Statistics: Num rows: 6 Data size: 26 Basic stats: COMPLETE Column stats: NONE + Union + Statistics: Num rows: 6 Data size: 26 Basic stats: COMPLETE Column stats: NONE + Select Operator + SELECT * : (no compute) + Statistics: Num rows: 6 Data size: 26 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(1) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint) + Local Work: + Map Reduce Local Work + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: bigint) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: SELECT count(1) FROM T1 a RIGHT OUTER JOIN T2 b ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT count(1) FROM T1 a RIGHT OUTER JOIN T2 b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +#### A masked pattern was here #### +8 diff --git ql/src/test/results/clientpositive/skewjoin_mapjoin11.q.out ql/src/test/results/clientpositive/skewjoin_mapjoin11.q.out new file mode 100644 index 0000000..ea180ec --- /dev/null +++ ql/src/test/results/clientpositive/skewjoin_mapjoin11.q.out @@ -0,0 +1,197 @@ +PREHOOK: query: CREATE TABLE T1(key STRING, val STRING) +CLUSTERED BY (key) INTO 4 BUCKETS +SKEWED BY (key) ON ((2)) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@T1 +POSTHOOK: query: CREATE TABLE T1(key STRING, val STRING) +CLUSTERED BY (key) INTO 4 BUCKETS +SKEWED BY (key) ON ((2)) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@T1 +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/T1.txt' INTO TABLE T1 +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@t1 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/T1.txt' INTO TABLE T1 +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@t1 +PREHOOK: query: CREATE TABLE T2(key STRING, val STRING) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@T2 +POSTHOOK: query: CREATE TABLE T2(key STRING, val STRING) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@T2 +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/T2.txt' INTO TABLE T2 +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@t2 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/T2.txt' INTO TABLE T2 +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@t2 +PREHOOK: query: -- copy from skewjoinopt19 +-- test compile time skew join and auto map join +-- add a test where the skewed key is also the bucketized key +-- it should not matter, and the compile time skewed join +-- optimization is performed +-- adding a order by at the end to make the results deterministic + +EXPLAIN +SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: -- copy from skewjoinopt19 +-- test compile time skew join and auto map join +-- add a test where the skewed key is also the bucketized key +-- it should not matter, and the compile time skewed join +-- optimization is performed +-- adding a order by at the end to make the results deterministic + +EXPLAIN +SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-8 is a root stage + Stage-2 depends on stages: Stage-8 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-8 + Map Reduce Local Work + Alias -> Map Local Tables: + a + Fetch Operator + limit: -1 + subquery1:a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + a + TableScan + alias: a + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: (key is not null and (key = '2')) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + HashTable Sink Operator + condition expressions: + 0 {val} + 1 {key} {val} + keys: + 0 key (type: string) + 1 key (type: string) + subquery1:a + TableScan + alias: a + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: (key is not null and (not (key = '2'))) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + HashTable Sink Operator + condition expressions: + 0 {val} + 1 {key} {val} + keys: + 0 key (type: string) + 1 key (type: string) + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + alias: b + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: (key is not null and (not (key = '2'))) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {val} + 1 {key} {val} + keys: + 0 key (type: string) + 1 key (type: string) + outputColumnNames: _col0, _col1, _col5, _col6 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Union + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + SELECT * : (no compute) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + TableScan + alias: b + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: (key is not null and (key = '2')) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {val} + 1 {key} {val} + keys: + 0 key (type: string) + 1 key (type: string) + outputColumnNames: _col0, _col1, _col5, _col6 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Union + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + SELECT * : (no compute) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Local Work: + Map Reduce Local Work + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key +ORDER BY a.key, b.key, a.val, b.val +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key +ORDER BY a.key, b.key, a.val, b.val +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +#### A masked pattern was here #### +2 12 2 22 +3 13 3 13 +8 18 8 18 +8 18 8 18 +8 28 8 18 +8 28 8 18 diff --git ql/src/test/results/clientpositive/skewjoin_mapjoin2.q.out ql/src/test/results/clientpositive/skewjoin_mapjoin2.q.out new file mode 100644 index 0000000..6b66d6c --- /dev/null +++ ql/src/test/results/clientpositive/skewjoin_mapjoin2.q.out @@ -0,0 +1,366 @@ +PREHOOK: query: CREATE TABLE T1(key STRING, val STRING) +SKEWED BY (key) ON ((2), (8)) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@T1 +POSTHOOK: query: CREATE TABLE T1(key STRING, val STRING) +SKEWED BY (key) ON ((2), (8)) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@T1 +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/T1.txt' INTO TABLE T1 +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@t1 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/T1.txt' INTO TABLE T1 +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@t1 +PREHOOK: query: CREATE TABLE T2(key STRING, val STRING) +SKEWED BY (key) ON ((3), (8)) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@T2 +POSTHOOK: query: CREATE TABLE T2(key STRING, val STRING) +SKEWED BY (key) ON ((3), (8)) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@T2 +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/T2.txt' INTO TABLE T2 +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@t2 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/T2.txt' INTO TABLE T2 +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@t2 +PREHOOK: query: -- copy from skewjoinopt3 +-- test compile time skew join and auto map join +-- a simple query with skew on both the tables. One of the skewed +-- value is common to both the tables. The skewed value should not be +-- repeated in the filter. +-- adding a order by at the end to make the results deterministic + +EXPLAIN +SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: -- copy from skewjoinopt3 +-- test compile time skew join and auto map join +-- a simple query with skew on both the tables. One of the skewed +-- value is common to both the tables. The skewed value should not be +-- repeated in the filter. +-- adding a order by at the end to make the results deterministic + +EXPLAIN +SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-8 is a root stage + Stage-2 depends on stages: Stage-8 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-8 + Map Reduce Local Work + Alias -> Map Local Tables: + a + Fetch Operator + limit: -1 + subquery1:a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + a + TableScan + alias: a + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: (key is not null and (((key = '2') or (key = '8')) or (key = '3'))) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + HashTable Sink Operator + condition expressions: + 0 {val} + 1 {key} {val} + keys: + 0 key (type: string) + 1 key (type: string) + subquery1:a + TableScan + alias: a + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: (key is not null and (not (((key = '2') or (key = '8')) or (key = '3')))) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + HashTable Sink Operator + condition expressions: + 0 {val} + 1 {key} {val} + keys: + 0 key (type: string) + 1 key (type: string) + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + alias: b + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: (key is not null and (not (((key = '2') or (key = '8')) or (key = '3')))) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {val} + 1 {key} {val} + keys: + 0 key (type: string) + 1 key (type: string) + outputColumnNames: _col0, _col1, _col5, _col6 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Union + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + SELECT * : (no compute) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + TableScan + alias: b + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: (key is not null and (((key = '2') or (key = '8')) or (key = '3'))) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {val} + 1 {key} {val} + keys: + 0 key (type: string) + 1 key (type: string) + outputColumnNames: _col0, _col1, _col5, _col6 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Union + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + SELECT * : (no compute) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Local Work: + Map Reduce Local Work + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key +ORDER BY a.key, b.key, a.val, b.val +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key +ORDER BY a.key, b.key, a.val, b.val +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +#### A masked pattern was here #### +2 12 2 22 +3 13 3 13 +8 18 8 18 +8 18 8 18 +8 28 8 18 +8 28 8 18 +PREHOOK: query: -- test outer joins also + +EXPLAIN +SELECT a.*, b.* FROM T1 a FULL OUTER JOIN T2 b ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: -- test outer joins also + +EXPLAIN +SELECT a.*, b.* FROM T1 a FULL OUTER JOIN T2 b ON a.key = b.key +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1, Stage-4 + Stage-4 is a root stage + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: a + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: (not (((key = '2') or (key = '8')) or (key = '3'))) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Reduce Output Operator + key expressions: key (type: string) + sort order: + + Map-reduce partition columns: key (type: string) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + value expressions: val (type: string) + TableScan + alias: b + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: (not (((key = '2') or (key = '8')) or (key = '3'))) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Reduce Output Operator + key expressions: key (type: string) + sort order: + + Map-reduce partition columns: key (type: string) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + value expressions: val (type: string) + Reduce Operator Tree: + Join Operator + condition map: + Outer Join 0 to 1 + condition expressions: + 0 {KEY.reducesinkkey0} {VALUE._col0} + 1 {KEY.reducesinkkey0} {VALUE._col0} + outputColumnNames: _col0, _col1, _col5, _col6 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Union + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + SELECT * : (no compute) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + TableScan + Union + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + SELECT * : (no compute) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-4 + Map Reduce + Map Operator Tree: + TableScan + alias: b + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: (((key = '2') or (key = '8')) or (key = '3')) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Reduce Output Operator + key expressions: key (type: string) + sort order: + + Map-reduce partition columns: key (type: string) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + value expressions: val (type: string) + TableScan + alias: a + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: (((key = '2') or (key = '8')) or (key = '3')) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Reduce Output Operator + key expressions: key (type: string) + sort order: + + Map-reduce partition columns: key (type: string) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + value expressions: val (type: string) + Reduce Operator Tree: + Join Operator + condition map: + Outer Join 0 to 1 + condition expressions: + 0 {KEY.reducesinkkey0} {VALUE._col0} + 1 {KEY.reducesinkkey0} {VALUE._col0} + outputColumnNames: _col0, _col1, _col5, _col6 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: SELECT a.*, b.* FROM T1 a FULL OUTER JOIN T2 b ON a.key = b.key +ORDER BY a.key, b.key, a.val, b.val +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT a.*, b.* FROM T1 a FULL OUTER JOIN T2 b ON a.key = b.key +ORDER BY a.key, b.key, a.val, b.val +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +#### A masked pattern was here #### +NULL NULL 4 14 +NULL NULL 5 15 +1 11 NULL NULL +2 12 2 22 +3 13 3 13 +7 17 NULL NULL +8 18 8 18 +8 18 8 18 +8 28 8 18 +8 28 8 18 diff --git ql/src/test/results/clientpositive/skewjoin_mapjoin3.q.out ql/src/test/results/clientpositive/skewjoin_mapjoin3.q.out new file mode 100644 index 0000000..bd2c108 --- /dev/null +++ ql/src/test/results/clientpositive/skewjoin_mapjoin3.q.out @@ -0,0 +1,197 @@ +PREHOOK: query: CREATE TABLE T1(key STRING, val STRING) +SKEWED BY (key, val) ON ((2, 12), (8, 18)) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@T1 +POSTHOOK: query: CREATE TABLE T1(key STRING, val STRING) +SKEWED BY (key, val) ON ((2, 12), (8, 18)) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@T1 +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/T1.txt' INTO TABLE T1 +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@t1 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/T1.txt' INTO TABLE T1 +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@t1 +PREHOOK: query: CREATE TABLE T2(key STRING, val STRING) +SKEWED BY (key, val) ON ((3, 13), (8, 18)) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@T2 +POSTHOOK: query: CREATE TABLE T2(key STRING, val STRING) +SKEWED BY (key, val) ON ((3, 13), (8, 18)) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@T2 +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/T2.txt' INTO TABLE T2 +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@t2 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/T2.txt' INTO TABLE T2 +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@t2 +PREHOOK: query: -- copy from skewjoinopt6 +-- test compile time skew join and auto map join +-- Both the join tables are skewed by 2 keys, and one of the skewed values +-- is common to both the tables. The join key is a subset of the skewed key set: +-- it only contains the first skewed key for both the tables +-- adding a order by at the end to make the results deterministic + +EXPLAIN +SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: -- copy from skewjoinopt6 +-- test compile time skew join and auto map join +-- Both the join tables are skewed by 2 keys, and one of the skewed values +-- is common to both the tables. The join key is a subset of the skewed key set: +-- it only contains the first skewed key for both the tables +-- adding a order by at the end to make the results deterministic + +EXPLAIN +SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-8 is a root stage + Stage-2 depends on stages: Stage-8 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-8 + Map Reduce Local Work + Alias -> Map Local Tables: + a + Fetch Operator + limit: -1 + subquery1:a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + a + TableScan + alias: a + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: (key is not null and (((key = '2') or (key = '8')) or (key = '3'))) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + HashTable Sink Operator + condition expressions: + 0 {val} + 1 {key} {val} + keys: + 0 key (type: string) + 1 key (type: string) + subquery1:a + TableScan + alias: a + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: (key is not null and (not (((key = '2') or (key = '8')) or (key = '3')))) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + HashTable Sink Operator + condition expressions: + 0 {val} + 1 {key} {val} + keys: + 0 key (type: string) + 1 key (type: string) + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + alias: b + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: (key is not null and (not (((key = '2') or (key = '8')) or (key = '3')))) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {val} + 1 {key} {val} + keys: + 0 key (type: string) + 1 key (type: string) + outputColumnNames: _col0, _col1, _col5, _col6 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Union + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + SELECT * : (no compute) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + TableScan + alias: b + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: (key is not null and (((key = '2') or (key = '8')) or (key = '3'))) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {val} + 1 {key} {val} + keys: + 0 key (type: string) + 1 key (type: string) + outputColumnNames: _col0, _col1, _col5, _col6 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Union + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + SELECT * : (no compute) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Local Work: + Map Reduce Local Work + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key +ORDER BY a.key, b.key, a.val, b.val +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key +ORDER BY a.key, b.key, a.val, b.val +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +#### A masked pattern was here #### +2 12 2 22 +3 13 3 13 +8 18 8 18 +8 18 8 18 +8 28 8 18 +8 28 8 18 diff --git ql/src/test/results/clientpositive/skewjoin_mapjoin4.q.out ql/src/test/results/clientpositive/skewjoin_mapjoin4.q.out new file mode 100644 index 0000000..ebe238e --- /dev/null +++ ql/src/test/results/clientpositive/skewjoin_mapjoin4.q.out @@ -0,0 +1,258 @@ +PREHOOK: query: CREATE TABLE T1(key STRING, val STRING) +SKEWED BY (key) ON ((2), (8)) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@T1 +POSTHOOK: query: CREATE TABLE T1(key STRING, val STRING) +SKEWED BY (key) ON ((2), (8)) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@T1 +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/T1.txt' INTO TABLE T1 +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@t1 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/T1.txt' INTO TABLE T1 +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@t1 +PREHOOK: query: CREATE TABLE T2(key STRING, val STRING) +SKEWED BY (key) ON ((3), (8)) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@T2 +POSTHOOK: query: CREATE TABLE T2(key STRING, val STRING) +SKEWED BY (key) ON ((3), (8)) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@T2 +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/T2.txt' INTO TABLE T2 +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@t2 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/T2.txt' INTO TABLE T2 +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@t2 +PREHOOK: query: CREATE TABLE T3(key STRING, val STRING) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@T3 +POSTHOOK: query: CREATE TABLE T3(key STRING, val STRING) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@T3 +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/T3.txt' INTO TABLE T3 +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@t3 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/T3.txt' INTO TABLE T3 +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@t3 +PREHOOK: query: -- copy from skewjoinopt7 +-- test compile time skew join and auto map join +-- This test is for validating skewed join compile time optimization for more than +-- 2 tables. The join key is the same, and so a 3-way join would be performed. +-- 2 of the 3 tables are skewed on the join key +-- adding a order by at the end to make the results deterministic + +EXPLAIN +SELECT a.*, b.*, c.* FROM T1 a JOIN T2 b ON a.key = b.key JOIN T3 c on a.key = c.key +PREHOOK: type: QUERY +POSTHOOK: query: -- copy from skewjoinopt7 +-- test compile time skew join and auto map join +-- This test is for validating skewed join compile time optimization for more than +-- 2 tables. The join key is the same, and so a 3-way join would be performed. +-- 2 of the 3 tables are skewed on the join key +-- adding a order by at the end to make the results deterministic + +EXPLAIN +SELECT a.*, b.*, c.* FROM T1 a JOIN T2 b ON a.key = b.key JOIN T3 c on a.key = c.key +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-10 is a root stage + Stage-2 depends on stages: Stage-10 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-10 + Map Reduce Local Work + Alias -> Map Local Tables: + a + Fetch Operator + limit: -1 + c + Fetch Operator + limit: -1 + subquery1:a + Fetch Operator + limit: -1 + subquery1:c + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + a + TableScan + alias: a + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: (key is not null and (((key = '2') or (key = '8')) or (key = '3'))) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + HashTable Sink Operator + condition expressions: + 0 {val} + 1 {key} {val} + 2 {val} + keys: + 0 key (type: string) + 1 key (type: string) + 2 key (type: string) + c + TableScan + alias: c + Statistics: Num rows: 0 Data size: 20 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: (key is not null and (((key = '2') or (key = '8')) or (key = '3'))) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + HashTable Sink Operator + condition expressions: + 0 {val} + 1 {key} {val} + 2 {val} + keys: + 0 key (type: string) + 1 key (type: string) + 2 key (type: string) + subquery1:a + TableScan + alias: a + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: (key is not null and (not (((key = '2') or (key = '8')) or (key = '3')))) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + HashTable Sink Operator + condition expressions: + 0 {val} + 1 {key} {val} + 2 {val} + keys: + 0 key (type: string) + 1 key (type: string) + 2 key (type: string) + subquery1:c + TableScan + alias: c + Statistics: Num rows: 0 Data size: 20 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: (key is not null and (not (((key = '2') or (key = '8')) or (key = '3')))) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + HashTable Sink Operator + condition expressions: + 0 {val} + 1 {key} {val} + 2 {val} + keys: + 0 key (type: string) + 1 key (type: string) + 2 key (type: string) + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + alias: b + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: (key is not null and (not (((key = '2') or (key = '8')) or (key = '3')))) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + Inner Join 0 to 2 + condition expressions: + 0 {key} {val} + 1 {key} {val} + 2 {key} {val} + keys: + 0 key (type: string) + 1 key (type: string) + 2 key (type: string) + outputColumnNames: _col0, _col1, _col5, _col6, _col10, _col11 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string), _col10 (type: string), _col11 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Union + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + SELECT * : (no compute) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + TableScan + alias: b + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: (key is not null and (((key = '2') or (key = '8')) or (key = '3'))) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + Inner Join 0 to 2 + condition expressions: + 0 {key} {val} + 1 {key} {val} + 2 {key} {val} + keys: + 0 key (type: string) + 1 key (type: string) + 2 key (type: string) + outputColumnNames: _col0, _col1, _col5, _col6, _col10, _col11 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string), _col10 (type: string), _col11 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Union + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + SELECT * : (no compute) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Local Work: + Map Reduce Local Work + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: SELECT a.*, b.*, c.* FROM T1 a JOIN T2 b ON a.key = b.key JOIN T3 c on a.key = c.key +ORDER BY a.key, b.key, c.key, a.val, b.val, c.val +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Input: default@t2 +PREHOOK: Input: default@t3 +#### A masked pattern was here #### +POSTHOOK: query: SELECT a.*, b.*, c.* FROM T1 a JOIN T2 b ON a.key = b.key JOIN T3 c on a.key = c.key +ORDER BY a.key, b.key, c.key, a.val, b.val, c.val +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +POSTHOOK: Input: default@t3 +#### A masked pattern was here #### +2 12 2 22 2 12 diff --git ql/src/test/results/clientpositive/skewjoin_mapjoin5.q.out ql/src/test/results/clientpositive/skewjoin_mapjoin5.q.out new file mode 100644 index 0000000..19f225d --- /dev/null +++ ql/src/test/results/clientpositive/skewjoin_mapjoin5.q.out @@ -0,0 +1,355 @@ +PREHOOK: query: CREATE TABLE T1(key STRING, val STRING) +SKEWED BY (key) ON ((2)) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@T1 +POSTHOOK: query: CREATE TABLE T1(key STRING, val STRING) +SKEWED BY (key) ON ((2)) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@T1 +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/T1.txt' INTO TABLE T1 +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@t1 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/T1.txt' INTO TABLE T1 +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@t1 +PREHOOK: query: CREATE TABLE T2(key STRING, val STRING) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@T2 +POSTHOOK: query: CREATE TABLE T2(key STRING, val STRING) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@T2 +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/T2.txt' INTO TABLE T2 +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@t2 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/T2.txt' INTO TABLE T2 +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@t2 +PREHOOK: query: -- copy from skewjoinopt9 +-- test compile time skew join and auto map join +-- no skew join compile time optimization would be performed if one of the +-- join sources is a sub-query consisting of a union all +-- adding a order by at the end to make the results deterministic +EXPLAIN +select * from +( +select key, val from T1 + union all +select key, val from T1 +) subq1 +join T2 b on subq1.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: -- copy from skewjoinopt9 +-- test compile time skew join and auto map join +-- no skew join compile time optimization would be performed if one of the +-- join sources is a sub-query consisting of a union all +-- adding a order by at the end to make the results deterministic +EXPLAIN +select * from +( +select key, val from T1 + union all +select key, val from T1 +) subq1 +join T2 b on subq1.key = b.key +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-5 is a root stage + Stage-4 depends on stages: Stage-5 + Stage-0 depends on stages: Stage-4 + +STAGE PLANS: + Stage: Stage-5 + Map Reduce Local Work + Alias -> Map Local Tables: + b + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + b + TableScan + alias: b + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: key is not null (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + HashTable Sink Operator + condition expressions: + 0 {_col0} {_col1} + 1 {val} + keys: + 0 _col0 (type: string) + 1 key (type: string) + + Stage: Stage-4 + Map Reduce + Map Operator Tree: + TableScan + alias: t1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: key is not null (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: key (type: string), val (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Union + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {_col0} {_col1} + 1 {key} {val} + keys: + 0 _col0 (type: string) + 1 key (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: string), _col2 (type: string), _col3 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + TableScan + alias: t1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: key is not null (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: key (type: string), val (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Union + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {_col0} {_col1} + 1 {key} {val} + keys: + 0 _col0 (type: string) + 1 key (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: string), _col2 (type: string), _col3 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Local Work: + Map Reduce Local Work + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select * from +( +select key, val from T1 + union all +select key, val from T1 +) subq1 +join T2 b on subq1.key = b.key +ORDER BY subq1.key, b.key, subq1.val, b.val +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: query: select * from +( +select key, val from T1 + union all +select key, val from T1 +) subq1 +join T2 b on subq1.key = b.key +ORDER BY subq1.key, b.key, subq1.val, b.val +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +#### A masked pattern was here #### +2 12 2 22 +2 12 2 22 +3 13 3 13 +3 13 3 13 +8 18 8 18 +8 18 8 18 +8 18 8 18 +8 18 8 18 +8 28 8 18 +8 28 8 18 +8 28 8 18 +8 28 8 18 +PREHOOK: query: -- no skew join compile time optimization would be performed if one of the +-- join sources is a sub-query consisting of a group by +EXPLAIN +select * from +( +select key, count(1) as cnt from T1 group by key +) subq1 +join T2 b on subq1.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: -- no skew join compile time optimization would be performed if one of the +-- join sources is a sub-query consisting of a group by +EXPLAIN +select * from +( +select key, count(1) as cnt from T1 group by key +) subq1 +join T2 b on subq1.key = b.key +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-5 depends on stages: Stage-1 + Stage-4 depends on stages: Stage-5 + Stage-0 depends on stages: Stage-4 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: t1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: key is not null (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Group By Operator + aggregations: count(1) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + value expressions: _col1 (type: bigint) + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: bigint) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-5 + Map Reduce Local Work + Alias -> Map Local Tables: + b + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + b + TableScan + alias: b + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: key is not null (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + HashTable Sink Operator + condition expressions: + 0 {_col0} {_col1} + 1 {val} + keys: + 0 _col0 (type: string) + 1 key (type: string) + + Stage: Stage-4 + Map Reduce + Map Operator Tree: + TableScan + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {_col0} {_col1} + 1 {key} {val} + keys: + 0 _col0 (type: string) + 1 key (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: bigint), _col2 (type: string), _col3 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Local Work: + Map Reduce Local Work + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select * from +( +select key, count(1) as cnt from T1 group by key +) subq1 +join T2 b on subq1.key = b.key +ORDER BY subq1.key, b.key, subq1.cnt, b.val +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: query: select * from +( +select key, count(1) as cnt from T1 group by key +) subq1 +join T2 b on subq1.key = b.key +ORDER BY subq1.key, b.key, subq1.cnt, b.val +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +#### A masked pattern was here #### +2 1 2 22 +3 1 3 13 +8 2 8 18 +8 2 8 18 diff --git ql/src/test/results/clientpositive/skewjoin_mapjoin6.q.out ql/src/test/results/clientpositive/skewjoin_mapjoin6.q.out new file mode 100644 index 0000000..9403c4f --- /dev/null +++ ql/src/test/results/clientpositive/skewjoin_mapjoin6.q.out @@ -0,0 +1,267 @@ +PREHOOK: query: CREATE TABLE T1(key STRING, value STRING) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@T1 +POSTHOOK: query: CREATE TABLE T1(key STRING, value STRING) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@T1 +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/T1.txt' INTO TABLE T1 +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@t1 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/T1.txt' INTO TABLE T1 +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@t1 +PREHOOK: query: drop table array_valued_T1 +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table array_valued_T1 +POSTHOOK: type: DROPTABLE +PREHOOK: query: create table array_valued_T1 (key string, value array) SKEWED BY (key) ON ((8)) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@array_valued_T1 +POSTHOOK: query: create table array_valued_T1 (key string, value array) SKEWED BY (key) ON ((8)) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@array_valued_T1 +PREHOOK: query: insert overwrite table array_valued_T1 select key, array(value) from T1 +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Output: default@array_valued_t1 +POSTHOOK: query: insert overwrite table array_valued_T1 select key, array(value) from T1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Output: default@array_valued_t1 +POSTHOOK: Lineage: array_valued_t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: array_valued_t1.value EXPRESSION [(t1)t1.FieldSchema(name:value, type:string, comment:null), ] +PREHOOK: query: -- copy from skewjoinopt10 +-- test compile time skew join and auto map join +-- This test is to verify the skew join compile optimization when the join is followed by a lateral view +-- adding a order by at the end to make the results deterministic + +explain +select * from (select a.key as key, b.value as array_val from T1 a join array_valued_T1 b on a.key=b.key) i lateral view explode (array_val) c as val +PREHOOK: type: QUERY +POSTHOOK: query: -- copy from skewjoinopt10 +-- test compile time skew join and auto map join +-- This test is to verify the skew join compile optimization when the join is followed by a lateral view +-- adding a order by at the end to make the results deterministic + +explain +select * from (select a.key as key, b.value as array_val from T1 a join array_valued_T1 b on a.key=b.key) i lateral view explode (array_val) c as val +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-8 is a root stage + Stage-2 depends on stages: Stage-8 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-8 + Map Reduce Local Work + Alias -> Map Local Tables: + i:a + Fetch Operator + limit: -1 + subquery1:a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + i:a + TableScan + alias: a + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: (key is not null and (key = '8')) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + HashTable Sink Operator + condition expressions: + 0 + 1 {value} + keys: + 0 key (type: string) + 1 key (type: string) + subquery1:a + TableScan + alias: a + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: (key is not null and (not (key = '8'))) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + HashTable Sink Operator + condition expressions: + 0 + 1 {value} + keys: + 0 key (type: string) + 1 key (type: string) + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + alias: b + Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (key is not null and (not (key = '8'))) (type: boolean) + Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} + 1 {value} + keys: + 0 key (type: string) + 1 key (type: string) + outputColumnNames: _col0, _col6 + Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col6 (type: array) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Union + Statistics: Num rows: 3 Data size: 12 Basic stats: COMPLETE Column stats: NONE + Select Operator + SELECT * : (no compute) + Statistics: Num rows: 3 Data size: 12 Basic stats: COMPLETE Column stats: NONE + Lateral View Forward + Statistics: Num rows: 3 Data size: 12 Basic stats: COMPLETE Column stats: NONE + Select Operator + SELECT * : (no compute) + expressions: _col0 (type: string), _col1 (type: array) + outputColumnNames: org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc, org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc + Statistics: Num rows: 3 Data size: 12 Basic stats: COMPLETE Column stats: NONE + Lateral View Join Operator + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: array), _col2 (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Select Operator + expressions: _col1 (type: array) + outputColumnNames: _col0 + Statistics: Num rows: 3 Data size: 12 Basic stats: COMPLETE Column stats: NONE + UDTF Operator + Statistics: Num rows: 3 Data size: 12 Basic stats: COMPLETE Column stats: NONE + function name: explode + Lateral View Join Operator + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: array), _col2 (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + TableScan + alias: b + Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (key is not null and (key = '8')) (type: boolean) + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} + 1 {value} + keys: + 0 key (type: string) + 1 key (type: string) + outputColumnNames: _col0, _col6 + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col6 (type: array) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE + Union + Statistics: Num rows: 3 Data size: 12 Basic stats: COMPLETE Column stats: NONE + Select Operator + SELECT * : (no compute) + Statistics: Num rows: 3 Data size: 12 Basic stats: COMPLETE Column stats: NONE + Lateral View Forward + Statistics: Num rows: 3 Data size: 12 Basic stats: COMPLETE Column stats: NONE + Select Operator + SELECT * : (no compute) + expressions: _col0 (type: string), _col1 (type: array) + outputColumnNames: org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc, org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc + Statistics: Num rows: 3 Data size: 12 Basic stats: COMPLETE Column stats: NONE + Lateral View Join Operator + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: array), _col2 (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Select Operator + expressions: _col1 (type: array) + outputColumnNames: _col0 + Statistics: Num rows: 3 Data size: 12 Basic stats: COMPLETE Column stats: NONE + UDTF Operator + Statistics: Num rows: 3 Data size: 12 Basic stats: COMPLETE Column stats: NONE + function name: explode + Lateral View Join Operator + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: array), _col2 (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Local Work: + Map Reduce Local Work + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select * from (select a.key as key, b.value as array_val from T1 a join array_valued_T1 b on a.key=b.key) i lateral view explode (array_val) c as val +ORDER BY key, val +PREHOOK: type: QUERY +PREHOOK: Input: default@array_valued_t1 +PREHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: query: select * from (select a.key as key, b.value as array_val from T1 a join array_valued_T1 b on a.key=b.key) i lateral view explode (array_val) c as val +ORDER BY key, val +POSTHOOK: type: QUERY +POSTHOOK: Input: default@array_valued_t1 +POSTHOOK: Input: default@t1 +#### A masked pattern was here #### +1 ["11"] 11 +2 ["12"] 12 +3 ["13"] 13 +7 ["17"] 17 +8 ["18"] 18 +8 ["18"] 18 +8 ["28"] 28 +8 ["28"] 28 diff --git ql/src/test/results/clientpositive/skewjoin_mapjoin7.q.out ql/src/test/results/clientpositive/skewjoin_mapjoin7.q.out new file mode 100644 index 0000000..1b64a24 --- /dev/null +++ ql/src/test/results/clientpositive/skewjoin_mapjoin7.q.out @@ -0,0 +1,363 @@ +PREHOOK: query: CREATE TABLE T1(key STRING, val STRING) +SKEWED BY (key) ON ((2)) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@T1 +POSTHOOK: query: CREATE TABLE T1(key STRING, val STRING) +SKEWED BY (key) ON ((2)) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@T1 +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/T1.txt' INTO TABLE T1 +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@t1 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/T1.txt' INTO TABLE T1 +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@t1 +PREHOOK: query: CREATE TABLE T2(key STRING, val STRING) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@T2 +POSTHOOK: query: CREATE TABLE T2(key STRING, val STRING) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@T2 +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/T2.txt' INTO TABLE T2 +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@t2 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/T2.txt' INTO TABLE T2 +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@t2 +PREHOOK: query: -- copy from skewjoinopt11 +-- test compile time skew join and auto map join +-- This test is to verify the skew join compile optimization when the join is followed +-- by a union. Both sides of a union consist of a join, which should have used +-- skew join compile time optimization. +-- adding an order by at the end to make the results deterministic + +EXPLAIN +select * from +( + select a.key, a.val as val1, b.val as val2 from T1 a join T2 b on a.key = b.key + union all + select a.key, a.val as val1, b.val as val2 from T1 a join T2 b on a.key = b.key +) subq1 +PREHOOK: type: QUERY +POSTHOOK: query: -- copy from skewjoinopt11 +-- test compile time skew join and auto map join +-- This test is to verify the skew join compile optimization when the join is followed +-- by a union. Both sides of a union consist of a join, which should have used +-- skew join compile time optimization. +-- adding an order by at the end to make the results deterministic + +EXPLAIN +select * from +( + select a.key, a.val as val1, b.val as val2 from T1 a join T2 b on a.key = b.key + union all + select a.key, a.val as val1, b.val as val2 from T1 a join T2 b on a.key = b.key +) subq1 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-16 is a root stage + Stage-2 depends on stages: Stage-16 + Stage-3 depends on stages: Stage-2, Stage-8 + Stage-17 is a root stage + Stage-8 depends on stages: Stage-17 + Stage-0 depends on stages: Stage-3 + +STAGE PLANS: + Stage: Stage-16 + Map Reduce Local Work + Alias -> Map Local Tables: + null-subquery2:subq1-subquery2:a + Fetch Operator + limit: -1 + subquery1:a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + null-subquery2:subq1-subquery2:a + TableScan + alias: a + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: (key is not null and (key = '2')) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + HashTable Sink Operator + condition expressions: + 0 {val} + 1 {val} + keys: + 0 key (type: string) + 1 key (type: string) + subquery1:a + TableScan + alias: a + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: (key is not null and (not (key = '2'))) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + HashTable Sink Operator + condition expressions: + 0 {val} + 1 {val} + keys: + 0 key (type: string) + 1 key (type: string) + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + alias: b + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: (key is not null and (not (key = '2'))) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {val} + 1 {val} + keys: + 0 key (type: string) + 1 key (type: string) + outputColumnNames: _col0, _col1, _col6 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: string), _col6 (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Union + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + SELECT * : (no compute) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + TableScan + alias: b + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: (key is not null and (key = '2')) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {val} + 1 {val} + keys: + 0 key (type: string) + 1 key (type: string) + outputColumnNames: _col0, _col1, _col6 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: string), _col6 (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Union + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + SELECT * : (no compute) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + Local Work: + Map Reduce Local Work + + Stage: Stage-3 + Map Reduce + Map Operator Tree: + TableScan + Union + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: string), _col2 (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + TableScan + Union + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: string), _col2 (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-17 + Map Reduce Local Work + Alias -> Map Local Tables: + null-subquery1:subq1-subquery1:a + Fetch Operator + limit: -1 + subquery2:a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + null-subquery1:subq1-subquery1:a + TableScan + alias: a + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: (key is not null and (key = '2')) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + HashTable Sink Operator + condition expressions: + 0 {val} + 1 {val} + keys: + 0 key (type: string) + 1 key (type: string) + subquery2:a + TableScan + alias: a + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: (key is not null and (not (key = '2'))) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + HashTable Sink Operator + condition expressions: + 0 {val} + 1 {val} + keys: + 0 key (type: string) + 1 key (type: string) + + Stage: Stage-8 + Map Reduce + Map Operator Tree: + TableScan + alias: b + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: (key is not null and (key = '2')) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {val} + 1 {val} + keys: + 0 key (type: string) + 1 key (type: string) + outputColumnNames: _col0, _col1, _col6 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: string), _col6 (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Union + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + SELECT * : (no compute) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + TableScan + alias: b + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: (key is not null and (not (key = '2'))) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {val} + 1 {val} + keys: + 0 key (type: string) + 1 key (type: string) + outputColumnNames: _col0, _col1, _col6 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: string), _col6 (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Union + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + SELECT * : (no compute) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + Local Work: + Map Reduce Local Work + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select * from +( + select a.key, a.val as val1, b.val as val2 from T1 a join T2 b on a.key = b.key + union all + select a.key, a.val as val1, b.val as val2 from T1 a join T2 b on a.key = b.key +) subq1 +ORDER BY key, val1, val2 +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: query: select * from +( + select a.key, a.val as val1, b.val as val2 from T1 a join T2 b on a.key = b.key + union all + select a.key, a.val as val1, b.val as val2 from T1 a join T2 b on a.key = b.key +) subq1 +ORDER BY key, val1, val2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +#### A masked pattern was here #### +2 12 22 +2 12 22 +3 13 13 +3 13 13 +8 18 18 +8 18 18 +8 18 18 +8 18 18 +8 28 18 +8 28 18 +8 28 18 +8 28 18 diff --git ql/src/test/results/clientpositive/skewjoin_mapjoin8.q.out ql/src/test/results/clientpositive/skewjoin_mapjoin8.q.out new file mode 100644 index 0000000..54f9706 --- /dev/null +++ ql/src/test/results/clientpositive/skewjoin_mapjoin8.q.out @@ -0,0 +1,197 @@ +PREHOOK: query: CREATE TABLE T1(key STRING, val STRING) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@T1 +POSTHOOK: query: CREATE TABLE T1(key STRING, val STRING) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@T1 +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/T1.txt' INTO TABLE T1 +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@t1 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/T1.txt' INTO TABLE T1 +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@t1 +PREHOOK: query: CREATE TABLE T2(key STRING, val STRING) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@T2 +POSTHOOK: query: CREATE TABLE T2(key STRING, val STRING) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@T2 +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/T2.txt' INTO TABLE T2 +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@t2 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/T2.txt' INTO TABLE T2 +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@t2 +PREHOOK: query: CREATE TABLE T3(key STRING, val STRING) +SKEWED BY (val) ON ((12)) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@T3 +POSTHOOK: query: CREATE TABLE T3(key STRING, val STRING) +SKEWED BY (val) ON ((12)) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@T3 +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/T3.txt' INTO TABLE T3 +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@t3 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/T3.txt' INTO TABLE T3 +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@t3 +PREHOOK: query: -- copy from skewjoinopt13 +-- test compile time skew join and auto map join +-- This test is for skewed join compile time optimization for more than 2 tables. +-- The join key for table 3 is different from the join key used for joining +-- tables 1 and 2. Table 3 is skewed, but since one of the join sources for table +-- 3 consist of a sub-query which contains a join, the compile time skew join +-- optimization is not performed +-- adding a order by at the end to make the results deterministic + +EXPLAIN +select * +from +T1 a join T2 b on a.key = b.key +join T3 c on a.val = c.val +PREHOOK: type: QUERY +POSTHOOK: query: -- copy from skewjoinopt13 +-- test compile time skew join and auto map join +-- This test is for skewed join compile time optimization for more than 2 tables. +-- The join key for table 3 is different from the join key used for joining +-- tables 1 and 2. Table 3 is skewed, but since one of the join sources for table +-- 3 consist of a sub-query which contains a join, the compile time skew join +-- optimization is not performed +-- adding a order by at the end to make the results deterministic + +EXPLAIN +select * +from +T1 a join T2 b on a.key = b.key +join T3 c on a.val = c.val +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-7 is a root stage + Stage-5 depends on stages: Stage-7 + Stage-0 depends on stages: Stage-5 + +STAGE PLANS: + Stage: Stage-7 + Map Reduce Local Work + Alias -> Map Local Tables: + a + Fetch Operator + limit: -1 + c + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + a + TableScan + alias: a + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: (key is not null and val is not null) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + HashTable Sink Operator + condition expressions: + 0 {val} + 1 {key} {val} + keys: + 0 key (type: string) + 1 key (type: string) + c + TableScan + alias: c + Statistics: Num rows: 0 Data size: 20 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: val is not null (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + HashTable Sink Operator + condition expressions: + 0 {_col0} {_col1} {_col5} {_col6} + 1 {key} + keys: + 0 _col1 (type: string) + 1 val (type: string) + + Stage: Stage-5 + Map Reduce + Map Operator Tree: + TableScan + alias: b + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: key is not null (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {val} + 1 {key} {val} + keys: + 0 key (type: string) + 1 key (type: string) + outputColumnNames: _col0, _col1, _col5, _col6 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {_col0} {_col1} {_col5} {_col6} + 1 {key} {val} + keys: + 0 _col1 (type: string) + 1 val (type: string) + outputColumnNames: _col0, _col1, _col5, _col6, _col10, _col11 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string), _col10 (type: string), _col11 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Local Work: + Map Reduce Local Work + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select * +from +T1 a join T2 b on a.key = b.key +join T3 c on a.val = c.val +order by a.key, b.key, c.key, a.val, b.val, c.val +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Input: default@t2 +PREHOOK: Input: default@t3 +#### A masked pattern was here #### +POSTHOOK: query: select * +from +T1 a join T2 b on a.key = b.key +join T3 c on a.val = c.val +order by a.key, b.key, c.key, a.val, b.val, c.val +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +POSTHOOK: Input: default@t3 +#### A masked pattern was here #### +2 12 2 22 2 12 diff --git ql/src/test/results/clientpositive/skewjoin_mapjoin9.q.out ql/src/test/results/clientpositive/skewjoin_mapjoin9.q.out new file mode 100644 index 0000000..d4b8354 --- /dev/null +++ ql/src/test/results/clientpositive/skewjoin_mapjoin9.q.out @@ -0,0 +1,267 @@ +PREHOOK: query: CREATE TABLE T1(key STRING, val STRING) +SKEWED BY (key) ON ((2)) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@T1 +POSTHOOK: query: CREATE TABLE T1(key STRING, val STRING) +SKEWED BY (key) ON ((2)) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@T1 +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/T1.txt' INTO TABLE T1 +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@t1 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/T1.txt' INTO TABLE T1 +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@t1 +PREHOOK: query: CREATE TABLE T2(key STRING, val STRING) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@T2 +POSTHOOK: query: CREATE TABLE T2(key STRING, val STRING) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@T2 +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/T2.txt' INTO TABLE T2 +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@t2 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/T2.txt' INTO TABLE T2 +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@t2 +PREHOOK: query: CREATE TABLE T3(key STRING, val STRING) +SKEWED BY (val) ON ((12)) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@T3 +POSTHOOK: query: CREATE TABLE T3(key STRING, val STRING) +SKEWED BY (val) ON ((12)) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@T3 +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/T3.txt' INTO TABLE T3 +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@t3 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/T3.txt' INTO TABLE T3 +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@t3 +PREHOOK: query: -- copy from skewjoinopt14 +-- test compile time skew join and auto map join +-- This test is for skewed join compile time optimization for more than 2 tables. +-- The join key for table 3 is different from the join key used for joining +-- tables 1 and 2. Tables 1 and 3 are skewed. Since one of the join sources for table +-- 3 consist of a sub-query which contains a join, the compile time skew join +-- optimization is not enabled for table 3, but it is used for the first join between +-- tables 1 and 2 +-- adding a order by at the end to make the results deterministic + +EXPLAIN +select * +from +T1 a join T2 b on a.key = b.key +join T3 c on a.val = c.val +PREHOOK: type: QUERY +POSTHOOK: query: -- copy from skewjoinopt14 +-- test compile time skew join and auto map join +-- This test is for skewed join compile time optimization for more than 2 tables. +-- The join key for table 3 is different from the join key used for joining +-- tables 1 and 2. Tables 1 and 3 are skewed. Since one of the join sources for table +-- 3 consist of a sub-query which contains a join, the compile time skew join +-- optimization is not enabled for table 3, but it is used for the first join between +-- tables 1 and 2 +-- adding a order by at the end to make the results deterministic + +EXPLAIN +select * +from +T1 a join T2 b on a.key = b.key +join T3 c on a.val = c.val +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-10 is a root stage + Stage-7 depends on stages: Stage-10 + Stage-0 depends on stages: Stage-7 + +STAGE PLANS: + Stage: Stage-10 + Map Reduce Local Work + Alias -> Map Local Tables: + a + Fetch Operator + limit: -1 + c + Fetch Operator + limit: -1 + subquery1:a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + a + TableScan + alias: a + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: ((key is not null and val is not null) and (key = '2')) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + HashTable Sink Operator + condition expressions: + 0 {val} + 1 {key} {val} + keys: + 0 key (type: string) + 1 key (type: string) + c + TableScan + alias: c + Statistics: Num rows: 0 Data size: 20 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: val is not null (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + HashTable Sink Operator + condition expressions: + 0 {_col0} {_col1} {_col5} {_col6} + 1 {key} + keys: + 0 _col1 (type: string) + 1 val (type: string) + subquery1:a + TableScan + alias: a + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: ((key is not null and val is not null) and (not (key = '2'))) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + HashTable Sink Operator + condition expressions: + 0 {val} + 1 {key} {val} + keys: + 0 key (type: string) + 1 key (type: string) + + Stage: Stage-7 + Map Reduce + Map Operator Tree: + TableScan + alias: b + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: (key is not null and (not (key = '2'))) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {val} + 1 {key} {val} + keys: + 0 key (type: string) + 1 key (type: string) + outputColumnNames: _col0, _col1, _col5, _col6 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Union + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + SELECT * : (no compute) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {_col0} {_col1} {_col5} {_col6} + 1 {key} {val} + keys: + 0 _col1 (type: string) + 1 val (type: string) + outputColumnNames: _col0, _col1, _col5, _col6, _col10, _col11 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string), _col10 (type: string), _col11 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + TableScan + alias: b + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: (key is not null and (key = '2')) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {val} + 1 {key} {val} + keys: + 0 key (type: string) + 1 key (type: string) + outputColumnNames: _col0, _col1, _col5, _col6 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Union + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + SELECT * : (no compute) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {_col0} {_col1} {_col5} {_col6} + 1 {key} {val} + keys: + 0 _col1 (type: string) + 1 val (type: string) + outputColumnNames: _col0, _col1, _col5, _col6, _col10, _col11 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string), _col10 (type: string), _col11 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Local Work: + Map Reduce Local Work + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select * +from +T1 a join T2 b on a.key = b.key +join T3 c on a.val = c.val +order by a.key, b.key, a.val, b.val +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Input: default@t2 +PREHOOK: Input: default@t3 +#### A masked pattern was here #### +POSTHOOK: query: select * +from +T1 a join T2 b on a.key = b.key +join T3 c on a.val = c.val +order by a.key, b.key, a.val, b.val +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +POSTHOOK: Input: default@t3 +#### A masked pattern was here #### +2 12 2 22 2 12