diff --git data/conf/spark/hive-site.xml data/conf/spark/hive-site.xml
index 44eac86..ee484d1 100644
--- data/conf/spark/hive-site.xml
+++ data/conf/spark/hive-site.xml
@@ -162,7 +162,7 @@
<name>hive.ignore.mapjoin.hint</name>
- <value>true</value>
+ <value>false</value>
<description>Whether Hive ignores the mapjoin hint</description>
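For context on this setting: with hive.ignore.mapjoin.hint set to false, the planner honors explicit /*+ MAPJOIN(t) */ hints, which the bucket map join qtests below depend on. A minimal sketch of toggling the flag through HiveConf (assuming the standard HIVEIGNOREMAPJOINHINT ConfVars entry; not part of the patch):

    import org.apache.hadoop.hive.conf.HiveConf;

    public class MapJoinHintFlag {
      public static void main(String[] args) {
        HiveConf conf = new HiveConf();
        // With the flag false, a query such as
        //   SELECT /*+ MAPJOIN(b) */ a.key FROM a JOIN b ON (a.key = b.key)
        // hash-loads b and performs the join on the map side.
        conf.setBoolVar(HiveConf.ConfVars.HIVEIGNOREMAPJOINHINT, false);
        System.out.println(conf.getBoolVar(HiveConf.ConfVars.HIVEIGNOREMAPJOINHINT));
      }
    }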
diff --git itests/src/test/resources/testconfiguration.properties itests/src/test/resources/testconfiguration.properties
index d6f8267..01a4a32 100644
--- itests/src/test/resources/testconfiguration.properties
+++ itests/src/test/resources/testconfiguration.properties
@@ -505,7 +505,6 @@ spark.query.files=add_part_multiple.q, \
auto_sortmerge_join_8.q, \
auto_sortmerge_join_9.q, \
auto_sortmerge_join_10.q, \
- auto_sortmerge_join_11.q, \
auto_sortmerge_join_12.q, \
auto_sortmerge_join_13.q, \
auto_sortmerge_join_14.q, \
@@ -523,7 +522,6 @@ spark.query.files=add_part_multiple.q, \
bucketmapjoin3.q, \
bucketmapjoin4.q, \
bucketmapjoin5.q, \
- bucketmapjoin6.q, \
bucketmapjoin7.q, \
bucketmapjoin8.q, \
bucketmapjoin9.q, \
@@ -663,13 +661,11 @@ spark.query.files=add_part_multiple.q, \
join_cond_pushdown_unqual3.q, \
join_cond_pushdown_unqual4.q, \
join_empty.q \
- join_filters.q, \
join_filters_overlap.q, \
join_hive_626.q, \
join_map_ppr.q, \
join_merge_multi_expressions.q, \
join_merging.q, \
- join_nulls.q, \
join_rc.q, \
join_reorder.q, \
join_reorder2.q, \
@@ -799,21 +795,6 @@ spark.query.files=add_part_multiple.q, \
skewjoin_noskew.q, \
skewjoin_union_remove_1.q, \
skewjoin_union_remove_2.q, \
- smb_mapjoin9.q, \
- smb_mapjoin_1.q, \
- smb_mapjoin_2.q, \
- smb_mapjoin_3.q, \
- smb_mapjoin_4.q, \
- smb_mapjoin_5.q, \
- smb_mapjoin_6.q, \
- smb_mapjoin_7.q, \
- smb_mapjoin_8.q, \
- smb_mapjoin_10.q, \
- smb_mapjoin_13.q, \
- smb_mapjoin_14.q, \
- smb_mapjoin_15.q, \
- smb_mapjoin_16.q, \
- smb_mapjoin_17.q, \
smb_mapjoin_18.q, \
smb_mapjoin_19.q, \
smb_mapjoin_20.q, \
@@ -821,14 +802,6 @@ spark.query.files=add_part_multiple.q, \
smb_mapjoin_22.q, \
smb_mapjoin_25.q, \
sort.q, \
- sort_merge_join_desc_1.q, \
- sort_merge_join_desc_2.q, \
- sort_merge_join_desc_3.q, \
- sort_merge_join_desc_4.q, \
- sort_merge_join_desc_5.q, \
- sort_merge_join_desc_6.q, \
- sort_merge_join_desc_7.q, \
- sort_merge_join_desc_8.q, \
spark_test.q, \
stats_counter.q, \
stats_counter_partitioned.q, \
@@ -942,7 +915,6 @@ spark.query.files=add_part_multiple.q, \
vectorization_part_project.q, \
vectorization_pushdown.q, \
vectorization_short_regress.q, \
- vectorized_bucketmapjoin1.q, \
vectorized_case.q, \
vectorized_mapjoin.q, \
vectorized_math_funcs.q, \
diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java
index 773c827..1f65a23 100644
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java
@@ -29,6 +29,7 @@
import java.util.Set;
import java.util.Stack;
+import com.google.common.base.Preconditions;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.Path;
@@ -291,6 +292,56 @@ private static void validateMapJoinTypes(Operator<? extends OperatorDesc> op)
}
/**
+ * convert a regular join to a map-side join, for the Spark branch
+ *
+ * @param opParseCtxMap
+ * @param op join operator
+ * @param joinTree qb join tree
+ * @param mapJoinPos position of the source to be read as part of the
+ * map-reduce framework; all other sources are cached in memory
+ * @param noCheckOuterJoin
+ */
+ public static MapJoinOperator convertMapJoinForSpark(HiveConf conf,
+ LinkedHashMap<Operator<? extends OperatorDesc>, OpParseContext> opParseCtxMap,
+ JoinOperator op, QBJoinTree joinTree, int mapJoinPos, boolean noCheckOuterJoin,
+ boolean validateMapJoinTree) throws SemanticException {
+
+ // outer join cannot be performed on a table which is being cached
+ JoinCondDesc[] condns = op.getConf().getConds();
+
+ if (!noCheckOuterJoin) {
+ if (checkMapJoin(mapJoinPos, condns) < 0) {
+ throw new SemanticException(ErrorMsg.NO_OUTER_MAPJOIN.getMsg());
+ }
+ }
+
+ // create the map-join operator
+ MapJoinOperator mapJoinOp = convertJoinOpMapJoinOp(conf, opParseCtxMap,
+ op, joinTree, mapJoinPos, noCheckOuterJoin);
+
+ List<Operator<? extends OperatorDesc>> parentOps =
+ new ArrayList<Operator<? extends OperatorDesc>>(mapJoinOp.getParentOperators());
+ for (int i = 0; i < parentOps.size(); i++) {
+ Operator<? extends OperatorDesc> parentOp = parentOps.get(i);
+ parentOp.getChildOperators().remove(op);
+ if (i == mapJoinPos) {
+ List<Operator<? extends OperatorDesc>> grandParentOps = parentOp.getParentOperators();
+ Preconditions.checkArgument(grandParentOps.size() == 1,
+ "AssertionError: expect number of parents to be 0, but was " + grandParentOps.size());
+ grandParentOps.get(0).replaceChild(parentOp, mapJoinOp);
+ mapJoinOp.replaceParent(parentOp, grandParentOps.get(0));
+ }
+ }
+
+ // make sure only map-joins can be performed.
+ if (validateMapJoinTree) {
+ validateMapJoinTypes(mapJoinOp);
+ }
+
+ return mapJoinOp;
+ }
+
+ /**
* convert a regular join to a map-side join.
*
* @param opParseCtxMap
@@ -497,8 +548,14 @@ public MapJoinOperator generateMapJoinOperator(ParseContext pctx, JoinOperator o
LinkedHashMap<Operator<? extends OperatorDesc>, OpParseContext> opParseCtxMap = pctx
.getOpParseCtx();
- MapJoinOperator mapJoinOp = convertMapJoin(pctx.getConf(), opParseCtxMap, op,
- joinTree, mapJoinPos, noCheckOuterJoin, true);
+ MapJoinOperator mapJoinOp;
+ if (HiveConf.getVar(hiveConf, HiveConf.ConfVars.HIVE_EXECUTION_ENGINE).equals("spark")) {
+ mapJoinOp = convertMapJoinForSpark(pctx.getConf(), opParseCtxMap, op,
+ joinTree, mapJoinPos, noCheckOuterJoin, false);
+ } else {
+ mapJoinOp = convertMapJoin(pctx.getConf(), opParseCtxMap, op,
+ joinTree, mapJoinPos, noCheckOuterJoin, true);
+ }
// create a dummy select to select all columns
genSelectPlan(pctx, mapJoinOp);
return mapJoinOp;
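To illustrate the rewiring that convertMapJoinForSpark performs on the big-table branch — cutting the ReduceSink out from between the table scan and the join so the new MapJoinOperator hangs directly off the scan — here is a toy sketch; Node and RewireSketch are illustrative stand-ins, not Hive's operator API:

    import java.util.ArrayList;
    import java.util.List;

    class Node {
      final String name;
      final List<Node> parents = new ArrayList<>();
      final List<Node> children = new ArrayList<>();
      Node(String name) { this.name = name; }
      void addChild(Node c) { children.add(c); c.parents.add(this); }
    }

    public class RewireSketch {
      public static void main(String[] args) {
        Node bigScan = new Node("TS[a]");
        Node reduceSink = new Node("RS[a]");
        Node mapJoin = new Node("MAPJOIN");
        bigScan.addChild(reduceSink);
        reduceSink.addChild(mapJoin);
        // The splice at i == mapJoinPos: replaceChild/replaceParent in the
        // patch make the scan point at MAPJOIN directly, dropping RS[a].
        bigScan.children.set(bigScan.children.indexOf(reduceSink), mapJoin);
        mapJoin.parents.set(mapJoin.parents.indexOf(reduceSink), bigScan);
        System.out.println(bigScan.name + " -> " + bigScan.children.get(0).name);
      }
    }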
diff --git ql/src/test/results/clientpositive/spark/bucket_map_join_1.q.out ql/src/test/results/clientpositive/spark/bucket_map_join_1.q.out
index f24ae73..835ee60 100644
--- ql/src/test/results/clientpositive/spark/bucket_map_join_1.q.out
+++ ql/src/test/results/clientpositive/spark/bucket_map_join_1.q.out
@@ -104,59 +104,60 @@ TOK_QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-2 is a root stage
+ Stage-1 depends on stages: Stage-2
Stage-0 depends on stages: Stage-1
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-2
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 4 (PARTITION-LEVEL SORT, 1)
- Reducer 3 <- Reducer 2 (GROUP, 1)
#### A masked pattern was here ####
Vertices:
- Map 1
+ Map 3
Map Operator Tree:
TableScan
- alias: a
- Statistics: Num rows: 0 Data size: 20 Basic stats: PARTIAL Column stats: NONE
+ alias: b
+ Statistics: Num rows: 0 Data size: 21 Basic stats: PARTIAL Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: (key is not null and value is not null) (type: boolean)
Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: string), value (type: string)
- sort order: ++
- Map-reduce partition columns: key (type: string), value (type: string)
- Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE
- tag: 0
- auto parallelism: false
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ keys:
+ 0 key (type: string), value (type: string)
+ 1 key (type: string), value (type: string)
+ Position of Big Table: 0
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
#### A masked pattern was here ####
Partition
- base file name: table1
+ base file name: table2
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
properties:
COLUMN_STATS_ACCURATE true
SORTBUCKETCOLSPREFIX TRUE
bucket_count 1
- bucket_field_name key
+ bucket_field_name value
columns key,value
columns.comments
columns.types string:string
#### A masked pattern was here ####
- name default.table1
+ name default.table2
numFiles 1
numRows 0
rawDataSize 0
- serialization.ddl struct table1 { string key, string value}
+ serialization.ddl struct table2 { string key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- totalSize 20
+ totalSize 21
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
@@ -166,67 +167,93 @@ STAGE PLANS:
COLUMN_STATS_ACCURATE true
SORTBUCKETCOLSPREFIX TRUE
bucket_count 1
- bucket_field_name key
+ bucket_field_name value
columns key,value
columns.comments
columns.types string:string
#### A masked pattern was here ####
- name default.table1
+ name default.table2
numFiles 1
numRows 0
rawDataSize 0
- serialization.ddl struct table1 { string key, string value}
+ serialization.ddl struct table2 { string key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- totalSize 20
+ totalSize 21
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.table1
- name: default.table1
+ name: default.table2
+ name: default.table2
Truncated Path -> Alias:
- /table1 [a]
- Map 4
+ /table2 [b]
+
+ Stage: Stage-1
+ Spark
+ Edges:
+ Reducer 2 <- Map 1 (GROUP, 1)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
Map Operator Tree:
TableScan
- alias: b
- Statistics: Num rows: 0 Data size: 21 Basic stats: PARTIAL Column stats: NONE
+ alias: a
+ Statistics: Num rows: 0 Data size: 20 Basic stats: PARTIAL Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: (key is not null and value is not null) (type: boolean)
Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: string), value (type: string)
- sort order: ++
- Map-reduce partition columns: key (type: string), value (type: string)
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ keys:
+ 0 key (type: string), value (type: string)
+ 1 key (type: string), value (type: string)
+ input vertices:
+ 1 Map 3
+ Position of Big Table: 0
Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE
- tag: 1
- auto parallelism: false
+ Group By Operator
+ aggregations: count()
+ mode: hash
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ sort order:
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+ tag: -1
+ value expressions: _col0 (type: bigint)
+ auto parallelism: false
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
#### A masked pattern was here ####
Partition
- base file name: table2
+ base file name: table1
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
properties:
COLUMN_STATS_ACCURATE true
SORTBUCKETCOLSPREFIX TRUE
bucket_count 1
- bucket_field_name value
+ bucket_field_name key
columns key,value
columns.comments
columns.types string:string
#### A masked pattern was here ####
- name default.table2
+ name default.table1
numFiles 1
numRows 0
rawDataSize 0
- serialization.ddl struct table2 { string key, string value}
+ serialization.ddl struct table1 { string key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- totalSize 21
+ totalSize 20
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
@@ -236,47 +263,26 @@ STAGE PLANS:
COLUMN_STATS_ACCURATE true
SORTBUCKETCOLSPREFIX TRUE
bucket_count 1
- bucket_field_name value
+ bucket_field_name key
columns key,value
columns.comments
columns.types string:string
#### A masked pattern was here ####
- name default.table2
+ name default.table1
numFiles 1
numRows 0
rawDataSize 0
- serialization.ddl struct table2 { string key, string value}
+ serialization.ddl struct table1 { string key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- totalSize 21
+ totalSize 20
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.table2
- name: default.table2
+ name: default.table1
+ name: default.table1
Truncated Path -> Alias:
- /table2 [b]
+ /table1 [a]
Reducer 2
- Needs Tagging: true
- Reduce Operator Tree:
- Join Operator
- condition map:
- Inner Join 0 to 1
- condition expressions:
- 0
- 1
- Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE
- Group By Operator
- aggregations: count()
- mode: hash
- outputColumnNames: _col0
- Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- sort order:
- Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
- tag: -1
- value expressions: _col0 (type: bigint)
- auto parallelism: false
- Reducer 3
Needs Tagging: false
Reduce Operator Tree:
Group By Operator
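Each golden-file change that follows shows the same transformation: the single shuffle stage (PARTITION-LEVEL SORT edges feeding a reduce-side Join Operator) becomes a hash-table build stage (Spark HashTable Sink over the small table) plus a map-side probe stage (Map Join Operator over the big table). A toy build/probe illustration, not Hive's implementation:

    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;

    public class BroadcastJoinSketch {
      public static void main(String[] args) {
        List<String[]> small = List.of(new String[]{"1", "b1"}, new String[]{"2", "b2"});
        List<String[]> big = List.of(new String[]{"1", "a1"}, new String[]{"3", "a3"});
        // "Stage-2" (Spark HashTable Sink): materialize the small side by key.
        Map<String, String> hashTable = new HashMap<>();
        for (String[] row : small) hashTable.put(row[0], row[1]);
        // "Stage-1" (Map Join Operator): probe while scanning the big side;
        // no shuffle and no reduce-side Join Operator are needed.
        for (String[] row : big) {
          String match = hashTable.get(row[0]);
          if (match != null) System.out.println(row[0] + "," + row[1] + "," + match);
        }
      }
    }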
diff --git ql/src/test/results/clientpositive/spark/bucket_map_join_2.q.out ql/src/test/results/clientpositive/spark/bucket_map_join_2.q.out
index 33e9e8b..73ba0b4 100644
--- ql/src/test/results/clientpositive/spark/bucket_map_join_2.q.out
+++ ql/src/test/results/clientpositive/spark/bucket_map_join_2.q.out
@@ -104,59 +104,60 @@ TOK_QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-2 is a root stage
+ Stage-1 depends on stages: Stage-2
Stage-0 depends on stages: Stage-1
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-2
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 4 (PARTITION-LEVEL SORT, 1)
- Reducer 3 <- Reducer 2 (GROUP, 1)
#### A masked pattern was here ####
Vertices:
- Map 1
+ Map 3
Map Operator Tree:
TableScan
- alias: a
- Statistics: Num rows: 0 Data size: 20 Basic stats: PARTIAL Column stats: NONE
+ alias: b
+ Statistics: Num rows: 0 Data size: 21 Basic stats: PARTIAL Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: (key is not null and value is not null) (type: boolean)
Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: string), value (type: string)
- sort order: ++
- Map-reduce partition columns: key (type: string), value (type: string)
- Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE
- tag: 0
- auto parallelism: false
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ keys:
+ 0 key (type: string), value (type: string)
+ 1 key (type: string), value (type: string)
+ Position of Big Table: 0
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
#### A masked pattern was here ####
Partition
- base file name: table1
+ base file name: table2
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
properties:
COLUMN_STATS_ACCURATE true
SORTBUCKETCOLSPREFIX TRUE
bucket_count 1
- bucket_field_name key
+ bucket_field_name value
columns key,value
columns.comments
columns.types string:string
#### A masked pattern was here ####
- name default.table1
+ name default.table2
numFiles 1
numRows 0
rawDataSize 0
- serialization.ddl struct table1 { string key, string value}
+ serialization.ddl struct table2 { string key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- totalSize 20
+ totalSize 21
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
@@ -166,67 +167,93 @@ STAGE PLANS:
COLUMN_STATS_ACCURATE true
SORTBUCKETCOLSPREFIX TRUE
bucket_count 1
- bucket_field_name key
+ bucket_field_name value
columns key,value
columns.comments
columns.types string:string
#### A masked pattern was here ####
- name default.table1
+ name default.table2
numFiles 1
numRows 0
rawDataSize 0
- serialization.ddl struct table1 { string key, string value}
+ serialization.ddl struct table2 { string key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- totalSize 20
+ totalSize 21
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.table1
- name: default.table1
+ name: default.table2
+ name: default.table2
Truncated Path -> Alias:
- /table1 [a]
- Map 4
+ /table2 [b]
+
+ Stage: Stage-1
+ Spark
+ Edges:
+ Reducer 2 <- Map 1 (GROUP, 1)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
Map Operator Tree:
TableScan
- alias: b
- Statistics: Num rows: 0 Data size: 21 Basic stats: PARTIAL Column stats: NONE
+ alias: a
+ Statistics: Num rows: 0 Data size: 20 Basic stats: PARTIAL Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: (key is not null and value is not null) (type: boolean)
Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: string), value (type: string)
- sort order: ++
- Map-reduce partition columns: key (type: string), value (type: string)
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ keys:
+ 0 key (type: string), value (type: string)
+ 1 key (type: string), value (type: string)
+ input vertices:
+ 1 Map 3
+ Position of Big Table: 0
Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE
- tag: 1
- auto parallelism: false
+ Group By Operator
+ aggregations: count()
+ mode: hash
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ sort order:
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+ tag: -1
+ value expressions: _col0 (type: bigint)
+ auto parallelism: false
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
#### A masked pattern was here ####
Partition
- base file name: table2
+ base file name: table1
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
properties:
COLUMN_STATS_ACCURATE true
SORTBUCKETCOLSPREFIX TRUE
bucket_count 1
- bucket_field_name value
+ bucket_field_name key
columns key,value
columns.comments
columns.types string:string
#### A masked pattern was here ####
- name default.table2
+ name default.table1
numFiles 1
numRows 0
rawDataSize 0
- serialization.ddl struct table2 { string key, string value}
+ serialization.ddl struct table1 { string key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- totalSize 21
+ totalSize 20
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
@@ -236,47 +263,26 @@ STAGE PLANS:
COLUMN_STATS_ACCURATE true
SORTBUCKETCOLSPREFIX TRUE
bucket_count 1
- bucket_field_name value
+ bucket_field_name key
columns key,value
columns.comments
columns.types string:string
#### A masked pattern was here ####
- name default.table2
+ name default.table1
numFiles 1
numRows 0
rawDataSize 0
- serialization.ddl struct table2 { string key, string value}
+ serialization.ddl struct table1 { string key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- totalSize 21
+ totalSize 20
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.table2
- name: default.table2
+ name: default.table1
+ name: default.table1
Truncated Path -> Alias:
- /table2 [b]
+ /table1 [a]
Reducer 2
- Needs Tagging: true
- Reduce Operator Tree:
- Join Operator
- condition map:
- Inner Join 0 to 1
- condition expressions:
- 0
- 1
- Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE
- Group By Operator
- aggregations: count()
- mode: hash
- outputColumnNames: _col0
- Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- sort order:
- Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
- tag: -1
- value expressions: _col0 (type: bigint)
- auto parallelism: false
- Reducer 3
Needs Tagging: false
Reduce Operator Tree:
Group By Operator
diff --git ql/src/test/results/clientpositive/spark/bucketmapjoin1.q.out ql/src/test/results/clientpositive/spark/bucketmapjoin1.q.out
index aaa0151..b18e02f 100644
--- ql/src/test/results/clientpositive/spark/bucketmapjoin1.q.out
+++ ql/src/test/results/clientpositive/spark/bucketmapjoin1.q.out
@@ -91,54 +91,26 @@ TOK_QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-2 is a root stage
+ Stage-1 depends on stages: Stage-2
Stage-0 depends on stages: Stage-1
STAGE PLANS:
+ Stage: Stage-2
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 2
+ Local Work:
+ Map Reduce Local Work
+
Stage: Stage-1
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 3 (PARTITION-LEVEL SORT, 1)
#### A masked pattern was here ####
Vertices:
Map 1
- Map 3
- Reducer 2
- Needs Tagging: true
- Reduce Operator Tree:
- Join Operator
- condition map:
- Inner Join 0 to 1
- condition expressions:
- 0 {KEY.reducesinkkey0} {VALUE._col0}
- 1 {VALUE._col0}
- outputColumnNames: _col0, _col1, _col7
- Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE
- Select Operator
- expressions: _col0 (type: int), _col1 (type: string), _col7 (type: string)
- outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE
- File Output Operator
- compressed: false
- GlobalTableId: 0
-#### A masked pattern was here ####
- NumFilesPerFileSink: 1
- Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE
-#### A masked pattern was here ####
- table:
- input format: org.apache.hadoop.mapred.TextInputFormat
- output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
- properties:
- columns _col0,_col1,_col2
- columns.types int:string:string
- escape.delim \
- hive.serialization.extend.nesting.levels true
- serialization.format 1
- serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- TotalFiles: 1
- GatherStats: false
- MultiFileSpray: false
+ Local Work:
+ Map Reduce Local Work
Stage: Stage-0
Fetch Operator
@@ -227,54 +199,26 @@ TOK_QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-2 is a root stage
+ Stage-1 depends on stages: Stage-2
Stage-0 depends on stages: Stage-1
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-2
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 3 (PARTITION-LEVEL SORT, 3)
#### A masked pattern was here ####
Vertices:
Map 1
- Map 3
- Reducer 2
- Needs Tagging: true
- Reduce Operator Tree:
- Join Operator
- condition map:
- Inner Join 0 to 1
- condition expressions:
- 0 {KEY.reducesinkkey0} {VALUE._col0}
- 1 {VALUE._col0}
- outputColumnNames: _col0, _col1, _col7
- Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE
- Select Operator
- expressions: _col0 (type: int), _col1 (type: string), _col7 (type: string)
- outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE
- File Output Operator
- compressed: false
- GlobalTableId: 0
-#### A masked pattern was here ####
- NumFilesPerFileSink: 1
- Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE
-#### A masked pattern was here ####
- table:
- input format: org.apache.hadoop.mapred.TextInputFormat
- output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
- properties:
- columns _col0,_col1,_col2
- columns.types int:string:string
- escape.delim \
- hive.serialization.extend.nesting.levels true
- serialization.format 1
- serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- TotalFiles: 1
- GatherStats: false
- MultiFileSpray: false
+ Local Work:
+ Map Reduce Local Work
+
+ Stage: Stage-1
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 2
+ Local Work:
+ Map Reduce Local Work
Stage: Stage-0
Fetch Operator
@@ -456,193 +400,196 @@ TOK_QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-3 is a root stage
+ Stage-1 depends on stages: Stage-3
Stage-0 depends on stages: Stage-1
Stage-2 depends on stages: Stage-0
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-3
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 3 (PARTITION-LEVEL SORT, 3)
#### A masked pattern was here ####
Vertices:
- Map 1
+ Map 2
Map Operator Tree:
TableScan
- alias: a
- Statistics: Num rows: 26 Data size: 2750 Basic stats: COMPLETE Column stats: NONE
+ alias: b
+ Statistics: Num rows: 55 Data size: 5812 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: key is not null (type: boolean)
- Statistics: Num rows: 13 Data size: 1375 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int)
- sort order: +
- Map-reduce partition columns: key (type: int)
- Statistics: Num rows: 13 Data size: 1375 Basic stats: COMPLETE Column stats: NONE
- tag: 0
- value expressions: value (type: string)
- auto parallelism: false
+ Statistics: Num rows: 28 Data size: 2958 Basic stats: COMPLETE Column stats: NONE
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0 {key} {value}
+ 1 {value}
+ keys:
+ 0 key (type: int)
+ 1 key (type: int)
+ Position of Big Table: 0
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
#### A masked pattern was here ####
Partition
- base file name: srcbucket_mapjoin
+ base file name: ds=2008-04-08
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
properties:
COLUMN_STATS_ACCURATE true
- bucket_count 2
+ bucket_count 4
bucket_field_name key
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin
- numFiles 2
- serialization.ddl struct srcbucket_mapjoin { i32 key, string value}
+ name default.srcbucket_mapjoin_part
+ numFiles 4
+ numRows 0
+ partition_columns ds
+ partition_columns.types string
+ rawDataSize 0
+ serialization.ddl struct srcbucket_mapjoin_part { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- totalSize 2750
+ totalSize 5812
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
properties:
- COLUMN_STATS_ACCURATE true
- bucket_count 2
+ bucket_count 4
bucket_field_name key
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin
- numFiles 2
- serialization.ddl struct srcbucket_mapjoin { i32 key, string value}
+ name default.srcbucket_mapjoin_part
+ partition_columns ds
+ partition_columns.types string
+ serialization.ddl struct srcbucket_mapjoin_part { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- totalSize 2750
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.srcbucket_mapjoin
- name: default.srcbucket_mapjoin
+ name: default.srcbucket_mapjoin_part
+ name: default.srcbucket_mapjoin_part
Truncated Path -> Alias:
- /srcbucket_mapjoin [a]
- Map 3
+ /srcbucket_mapjoin_part/ds=2008-04-08 [b]
+
+ Stage: Stage-1
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
Map Operator Tree:
TableScan
- alias: b
- Statistics: Num rows: 55 Data size: 5812 Basic stats: COMPLETE Column stats: NONE
+ alias: a
+ Statistics: Num rows: 26 Data size: 2750 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: key is not null (type: boolean)
- Statistics: Num rows: 28 Data size: 2958 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int)
- sort order: +
- Map-reduce partition columns: key (type: int)
- Statistics: Num rows: 28 Data size: 2958 Basic stats: COMPLETE Column stats: NONE
- tag: 1
- value expressions: value (type: string)
- auto parallelism: false
+ Statistics: Num rows: 13 Data size: 1375 Basic stats: COMPLETE Column stats: NONE
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {key} {value}
+ 1 {value}
+ keys:
+ 0 key (type: int)
+ 1 key (type: int)
+ outputColumnNames: _col0, _col1, _col6
+ input vertices:
+ 1 Map 2
+ Position of Big Table: 0
+ Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: _col0 (type: int), _col1 (type: string), _col6 (type: string)
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ GlobalTableId: 1
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ bucket_count -1
+ columns key,value1,value2
+ columns.comments
+ columns.types string:string:string
+#### A masked pattern was here ####
+ name default.bucketmapjoin_tmp_result
+ serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucketmapjoin_tmp_result
+ TotalFiles: 1
+ GatherStats: true
+ MultiFileSpray: false
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
#### A masked pattern was here ####
Partition
- base file name: ds=2008-04-08
+ base file name: srcbucket_mapjoin
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
- partition values:
- ds 2008-04-08
properties:
COLUMN_STATS_ACCURATE true
- bucket_count 4
+ bucket_count 2
bucket_field_name key
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part
- numFiles 4
- numRows 0
- partition_columns ds
- partition_columns.types string
- rawDataSize 0
- serialization.ddl struct srcbucket_mapjoin_part { i32 key, string value}
+ name default.srcbucket_mapjoin
+ numFiles 2
+ serialization.ddl struct srcbucket_mapjoin { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- totalSize 5812
+ totalSize 2750
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
properties:
- bucket_count 4
+ COLUMN_STATS_ACCURATE true
+ bucket_count 2
bucket_field_name key
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part
- partition_columns ds
- partition_columns.types string
- serialization.ddl struct srcbucket_mapjoin_part { i32 key, string value}
+ name default.srcbucket_mapjoin
+ numFiles 2
+ serialization.ddl struct srcbucket_mapjoin { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 2750
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.srcbucket_mapjoin_part
- name: default.srcbucket_mapjoin_part
+ name: default.srcbucket_mapjoin
+ name: default.srcbucket_mapjoin
Truncated Path -> Alias:
- /srcbucket_mapjoin_part/ds=2008-04-08 [b]
- Reducer 2
- Needs Tagging: true
- Reduce Operator Tree:
- Join Operator
- condition map:
- Inner Join 0 to 1
- condition expressions:
- 0 {KEY.reducesinkkey0} {VALUE._col0}
- 1 {VALUE._col0}
- outputColumnNames: _col0, _col1, _col6
- Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE
- Select Operator
- expressions: _col0 (type: int), _col1 (type: string), _col6 (type: string)
- outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE
- File Output Operator
- compressed: false
- GlobalTableId: 1
-#### A masked pattern was here ####
- NumFilesPerFileSink: 1
- Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE
-#### A masked pattern was here ####
- table:
- input format: org.apache.hadoop.mapred.TextInputFormat
- output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
- properties:
- bucket_count -1
- columns key,value1,value2
- columns.comments
- columns.types string:string:string
-#### A masked pattern was here ####
- name default.bucketmapjoin_tmp_result
- serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2}
- serialization.format 1
- serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
-#### A masked pattern was here ####
- serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.bucketmapjoin_tmp_result
- TotalFiles: 1
- GatherStats: true
- MultiFileSpray: false
+ /srcbucket_mapjoin [a]
Stage: Stage-0
Move Operator
@@ -841,15 +788,14 @@ TOK_QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-3 is a root stage
+ Stage-1 depends on stages: Stage-3
Stage-0 depends on stages: Stage-1
Stage-2 depends on stages: Stage-0
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-3
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 3 (PARTITION-LEVEL SORT, 3)
#### A masked pattern was here ####
Vertices:
Map 1
@@ -862,14 +808,16 @@ STAGE PLANS:
isSamplingPred: false
predicate: key is not null (type: boolean)
Statistics: Num rows: 13 Data size: 1375 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int)
- sort order: +
- Map-reduce partition columns: key (type: int)
- Statistics: Num rows: 13 Data size: 1375 Basic stats: COMPLETE Column stats: NONE
- tag: 0
- value expressions: value (type: string)
- auto parallelism: false
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0 {value}
+ 1 {value}
+ keys:
+ 0 key (type: int)
+ 1 key (type: int)
+ Position of Big Table: 1
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
@@ -917,7 +865,12 @@ STAGE PLANS:
name: default.srcbucket_mapjoin
Truncated Path -> Alias:
/srcbucket_mapjoin [a]
- Map 3
+
+ Stage: Stage-1
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 2
Map Operator Tree:
TableScan
alias: b
@@ -927,14 +880,57 @@ STAGE PLANS:
isSamplingPred: false
predicate: key is not null (type: boolean)
Statistics: Num rows: 28 Data size: 2958 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int)
- sort order: +
- Map-reduce partition columns: key (type: int)
- Statistics: Num rows: 28 Data size: 2958 Basic stats: COMPLETE Column stats: NONE
- tag: 1
- value expressions: value (type: string)
- auto parallelism: false
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {key} {value}
+ 1 {value}
+ keys:
+ 0 key (type: int)
+ 1 key (type: int)
+ outputColumnNames: _col0, _col1, _col6
+ input vertices:
+ 0 Map 1
+ Position of Big Table: 1
+ Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: _col0 (type: int), _col1 (type: string), _col6 (type: string)
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ GlobalTableId: 1
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ COLUMN_STATS_ACCURATE true
+ bucket_count -1
+ columns key,value1,value2
+ columns.comments
+ columns.types string:string:string
+#### A masked pattern was here ####
+ name default.bucketmapjoin_tmp_result
+ numFiles 1
+ numRows 464
+ rawDataSize 8519
+ serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 8983
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucketmapjoin_tmp_result
+ TotalFiles: 1
+ GatherStats: true
+ MultiFileSpray: false
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
@@ -987,52 +983,6 @@ STAGE PLANS:
name: default.srcbucket_mapjoin_part
Truncated Path -> Alias:
/srcbucket_mapjoin_part/ds=2008-04-08 [b]
- Reducer 2
- Needs Tagging: true
- Reduce Operator Tree:
- Join Operator
- condition map:
- Inner Join 0 to 1
- condition expressions:
- 0 {KEY.reducesinkkey0} {VALUE._col0}
- 1 {VALUE._col0}
- outputColumnNames: _col0, _col1, _col6
- Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE
- Select Operator
- expressions: _col0 (type: int), _col1 (type: string), _col6 (type: string)
- outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE
- File Output Operator
- compressed: false
- GlobalTableId: 1
-#### A masked pattern was here ####
- NumFilesPerFileSink: 1
- Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE
-#### A masked pattern was here ####
- table:
- input format: org.apache.hadoop.mapred.TextInputFormat
- output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
- properties:
- COLUMN_STATS_ACCURATE true
- bucket_count -1
- columns key,value1,value2
- columns.comments
- columns.types string:string:string
-#### A masked pattern was here ####
- name default.bucketmapjoin_tmp_result
- numFiles 3
- numRows 464
- rawDataSize 8519
- serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2}
- serialization.format 1
- serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- totalSize 8983
-#### A masked pattern was here ####
- serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.bucketmapjoin_tmp_result
- TotalFiles: 1
- GatherStats: true
- MultiFileSpray: false
Stage: Stage-0
Move Operator
@@ -1050,7 +1000,7 @@ STAGE PLANS:
columns.types string:string:string
#### A masked pattern was here ####
name default.bucketmapjoin_tmp_result
- numFiles 3
+ numFiles 1
numRows 464
rawDataSize 8519
serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2}
diff --git ql/src/test/results/clientpositive/spark/bucketmapjoin10.q.out ql/src/test/results/clientpositive/spark/bucketmapjoin10.q.out
index 9954b77..eda8ca2 100644
--- ql/src/test/results/clientpositive/spark/bucketmapjoin10.q.out
+++ ql/src/test/results/clientpositive/spark/bucketmapjoin10.q.out
@@ -192,34 +192,35 @@ TOK_QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-2 is a root stage
+ Stage-1 depends on stages: Stage-2
Stage-0 depends on stages: Stage-1
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-2
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 4 (PARTITION-LEVEL SORT, 1)
- Reducer 3 <- Reducer 2 (GROUP, 1)
#### A masked pattern was here ####
Vertices:
- Map 1
+ Map 3
Map Operator Tree:
TableScan
- alias: a
+ alias: b
Statistics: Num rows: 1737 Data size: 6950 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: key is not null (type: boolean)
Statistics: Num rows: 869 Data size: 3477 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int)
- sort order: +
- Map-reduce partition columns: key (type: int)
- Statistics: Num rows: 869 Data size: 3477 Basic stats: COMPLETE Column stats: NONE
- tag: 0
- auto parallelism: false
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ keys:
+ 0 key (type: int)
+ 1 key (type: int)
+ Position of Big Table: 0
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
@@ -232,22 +233,22 @@ STAGE PLANS:
part 1
properties:
COLUMN_STATS_ACCURATE true
- bucket_count 2
+ bucket_count 3
bucket_field_name key
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_1
- numFiles 2
+ name default.srcbucket_mapjoin_part_2
+ numFiles 3
numRows 0
partition_columns part
partition_columns.types string
rawDataSize 0
- serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- totalSize 2750
+ totalSize 4200
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
@@ -260,16 +261,16 @@ STAGE PLANS:
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_1
+ name default.srcbucket_mapjoin_part_2
partition_columns part
partition_columns.types string
- serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.srcbucket_mapjoin_part_1
- name: default.srcbucket_mapjoin_part_1
+ name: default.srcbucket_mapjoin_part_2
+ name: default.srcbucket_mapjoin_part_2
#### A masked pattern was here ####
Partition
base file name: part=2
@@ -279,22 +280,22 @@ STAGE PLANS:
part 2
properties:
COLUMN_STATS_ACCURATE true
- bucket_count 3
+ bucket_count 2
bucket_field_name key
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_1
- numFiles 3
+ name default.srcbucket_mapjoin_part_2
+ numFiles 2
numRows 0
partition_columns part
partition_columns.types string
rawDataSize 0
- serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- totalSize 4200
+ totalSize 2750
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
@@ -307,36 +308,62 @@ STAGE PLANS:
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_1
+ name default.srcbucket_mapjoin_part_2
partition_columns part
partition_columns.types string
- serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.srcbucket_mapjoin_part_1
- name: default.srcbucket_mapjoin_part_1
+ name: default.srcbucket_mapjoin_part_2
+ name: default.srcbucket_mapjoin_part_2
Truncated Path -> Alias:
- /srcbucket_mapjoin_part_1/part=1 [a]
- /srcbucket_mapjoin_part_1/part=2 [a]
- Map 4
+ /srcbucket_mapjoin_part_2/part=1 [b]
+ /srcbucket_mapjoin_part_2/part=2 [b]
+
+ Stage: Stage-1
+ Spark
+ Edges:
+ Reducer 2 <- Map 1 (GROUP, 1)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
Map Operator Tree:
TableScan
- alias: b
+ alias: a
Statistics: Num rows: 1737 Data size: 6950 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: key is not null (type: boolean)
Statistics: Num rows: 869 Data size: 3477 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int)
- sort order: +
- Map-reduce partition columns: key (type: int)
- Statistics: Num rows: 869 Data size: 3477 Basic stats: COMPLETE Column stats: NONE
- tag: 1
- auto parallelism: false
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ keys:
+ 0 key (type: int)
+ 1 key (type: int)
+ input vertices:
+ 1 Map 3
+ Position of Big Table: 0
+ Statistics: Num rows: 955 Data size: 3824 Basic stats: COMPLETE Column stats: NONE
+ Group By Operator
+ aggregations: count()
+ mode: hash
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ sort order:
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+ tag: -1
+ value expressions: _col0 (type: bigint)
+ auto parallelism: false
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
@@ -349,22 +376,22 @@ STAGE PLANS:
part 1
properties:
COLUMN_STATS_ACCURATE true
- bucket_count 3
+ bucket_count 2
bucket_field_name key
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_2
- numFiles 3
+ name default.srcbucket_mapjoin_part_1
+ numFiles 2
numRows 0
partition_columns part
partition_columns.types string
rawDataSize 0
- serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- totalSize 4200
+ totalSize 2750
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
@@ -377,16 +404,16 @@ STAGE PLANS:
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_2
+ name default.srcbucket_mapjoin_part_1
partition_columns part
partition_columns.types string
- serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.srcbucket_mapjoin_part_2
- name: default.srcbucket_mapjoin_part_2
+ name: default.srcbucket_mapjoin_part_1
+ name: default.srcbucket_mapjoin_part_1
#### A masked pattern was here ####
Partition
base file name: part=2
@@ -396,22 +423,22 @@ STAGE PLANS:
part 2
properties:
COLUMN_STATS_ACCURATE true
- bucket_count 2
+ bucket_count 3
bucket_field_name key
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_2
- numFiles 2
+ name default.srcbucket_mapjoin_part_1
+ numFiles 3
numRows 0
partition_columns part
partition_columns.types string
rawDataSize 0
- serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- totalSize 2750
+ totalSize 4200
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
@@ -424,41 +451,20 @@ STAGE PLANS:
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_2
+ name default.srcbucket_mapjoin_part_1
partition_columns part
partition_columns.types string
- serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.srcbucket_mapjoin_part_2
- name: default.srcbucket_mapjoin_part_2
+ name: default.srcbucket_mapjoin_part_1
+ name: default.srcbucket_mapjoin_part_1
Truncated Path -> Alias:
- /srcbucket_mapjoin_part_2/part=1 [b]
- /srcbucket_mapjoin_part_2/part=2 [b]
+ /srcbucket_mapjoin_part_1/part=1 [a]
+ /srcbucket_mapjoin_part_1/part=2 [a]
Reducer 2
- Needs Tagging: true
- Reduce Operator Tree:
- Join Operator
- condition map:
- Inner Join 0 to 1
- condition expressions:
- 0
- 1
- Statistics: Num rows: 955 Data size: 3824 Basic stats: COMPLETE Column stats: NONE
- Group By Operator
- aggregations: count()
- mode: hash
- outputColumnNames: _col0
- Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- sort order:
- Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
- tag: -1
- value expressions: _col0 (type: bigint)
- auto parallelism: false
- Reducer 3
Needs Tagging: false
Reduce Operator Tree:
Group By Operator
diff --git ql/src/test/results/clientpositive/spark/bucketmapjoin11.q.out ql/src/test/results/clientpositive/spark/bucketmapjoin11.q.out
index ad8f0a5..bb7214c 100644
--- ql/src/test/results/clientpositive/spark/bucketmapjoin11.q.out
+++ ql/src/test/results/clientpositive/spark/bucketmapjoin11.q.out
@@ -202,34 +202,35 @@ TOK_QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-2 is a root stage
+ Stage-1 depends on stages: Stage-2
Stage-0 depends on stages: Stage-1
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-2
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 4 (PARTITION-LEVEL SORT, 1)
- Reducer 3 <- Reducer 2 (GROUP, 1)
#### A masked pattern was here ####
Vertices:
- Map 1
+ Map 3
Map Operator Tree:
TableScan
- alias: a
+ alias: b
Statistics: Num rows: 2140 Data size: 8562 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: key is not null (type: boolean)
Statistics: Num rows: 1070 Data size: 4281 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int)
- sort order: +
- Map-reduce partition columns: key (type: int)
- Statistics: Num rows: 1070 Data size: 4281 Basic stats: COMPLETE Column stats: NONE
- tag: 0
- auto parallelism: false
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ keys:
+ 0 key (type: int)
+ 1 key (type: int)
+ Position of Big Table: 0
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
@@ -242,44 +243,44 @@ STAGE PLANS:
part 1
properties:
COLUMN_STATS_ACCURATE true
- bucket_count 2
+ bucket_count 4
bucket_field_name key
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_1
- numFiles 2
+ name default.srcbucket_mapjoin_part_2
+ numFiles 4
numRows 0
partition_columns part
partition_columns.types string
rawDataSize 0
- serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- totalSize 2750
+ totalSize 5812
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
properties:
- bucket_count 4
+ bucket_count 2
bucket_field_name key
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_1
+ name default.srcbucket_mapjoin_part_2
partition_columns part
partition_columns.types string
- serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.srcbucket_mapjoin_part_1
- name: default.srcbucket_mapjoin_part_1
+ name: default.srcbucket_mapjoin_part_2
+ name: default.srcbucket_mapjoin_part_2
#### A masked pattern was here ####
Partition
base file name: part=2
@@ -289,64 +290,90 @@ STAGE PLANS:
part 2
properties:
COLUMN_STATS_ACCURATE true
- bucket_count 4
+ bucket_count 2
bucket_field_name key
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_1
- numFiles 4
+ name default.srcbucket_mapjoin_part_2
+ numFiles 2
numRows 0
partition_columns part
partition_columns.types string
rawDataSize 0
- serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- totalSize 5812
+ totalSize 2750
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
properties:
- bucket_count 4
+ bucket_count 2
bucket_field_name key
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_1
+ name default.srcbucket_mapjoin_part_2
partition_columns part
partition_columns.types string
- serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.srcbucket_mapjoin_part_1
- name: default.srcbucket_mapjoin_part_1
+ name: default.srcbucket_mapjoin_part_2
+ name: default.srcbucket_mapjoin_part_2
Truncated Path -> Alias:
- /srcbucket_mapjoin_part_1/part=1 [a]
- /srcbucket_mapjoin_part_1/part=2 [a]
- Map 4
+ /srcbucket_mapjoin_part_2/part=1 [b]
+ /srcbucket_mapjoin_part_2/part=2 [b]
+
+ Stage: Stage-1
+ Spark
+ Edges:
+ Reducer 2 <- Map 1 (GROUP, 1)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
Map Operator Tree:
TableScan
- alias: b
+ alias: a
Statistics: Num rows: 2140 Data size: 8562 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: key is not null (type: boolean)
Statistics: Num rows: 1070 Data size: 4281 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int)
- sort order: +
- Map-reduce partition columns: key (type: int)
- Statistics: Num rows: 1070 Data size: 4281 Basic stats: COMPLETE Column stats: NONE
- tag: 1
- auto parallelism: false
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ keys:
+ 0 key (type: int)
+ 1 key (type: int)
+ input vertices:
+ 1 Map 3
+ Position of Big Table: 0
+ Statistics: Num rows: 1177 Data size: 4709 Basic stats: COMPLETE Column stats: NONE
+ Group By Operator
+ aggregations: count()
+ mode: hash
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ sort order:
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+ tag: -1
+ value expressions: _col0 (type: bigint)
+ auto parallelism: false
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
@@ -359,44 +386,44 @@ STAGE PLANS:
part 1
properties:
COLUMN_STATS_ACCURATE true
- bucket_count 4
+ bucket_count 2
bucket_field_name key
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_2
- numFiles 4
+ name default.srcbucket_mapjoin_part_1
+ numFiles 2
numRows 0
partition_columns part
partition_columns.types string
rawDataSize 0
- serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- totalSize 5812
+ totalSize 2750
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
properties:
- bucket_count 2
+ bucket_count 4
bucket_field_name key
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_2
+ name default.srcbucket_mapjoin_part_1
partition_columns part
partition_columns.types string
- serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.srcbucket_mapjoin_part_2
- name: default.srcbucket_mapjoin_part_2
+ name: default.srcbucket_mapjoin_part_1
+ name: default.srcbucket_mapjoin_part_1
#### A masked pattern was here ####
Partition
base file name: part=2
@@ -406,69 +433,48 @@ STAGE PLANS:
part 2
properties:
COLUMN_STATS_ACCURATE true
- bucket_count 2
+ bucket_count 4
bucket_field_name key
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_2
- numFiles 2
+ name default.srcbucket_mapjoin_part_1
+ numFiles 4
numRows 0
partition_columns part
partition_columns.types string
rawDataSize 0
- serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- totalSize 2750
+ totalSize 5812
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
properties:
- bucket_count 2
+ bucket_count 4
bucket_field_name key
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_2
+ name default.srcbucket_mapjoin_part_1
partition_columns part
partition_columns.types string
- serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.srcbucket_mapjoin_part_2
- name: default.srcbucket_mapjoin_part_2
+ name: default.srcbucket_mapjoin_part_1
+ name: default.srcbucket_mapjoin_part_1
Truncated Path -> Alias:
- /srcbucket_mapjoin_part_2/part=1 [b]
- /srcbucket_mapjoin_part_2/part=2 [b]
+ /srcbucket_mapjoin_part_1/part=1 [a]
+ /srcbucket_mapjoin_part_1/part=2 [a]
Reducer 2
- Needs Tagging: true
- Reduce Operator Tree:
- Join Operator
- condition map:
- Inner Join 0 to 1
- condition expressions:
- 0
- 1
- Statistics: Num rows: 1177 Data size: 4709 Basic stats: COMPLETE Column stats: NONE
- Group By Operator
- aggregations: count()
- mode: hash
- outputColumnNames: _col0
- Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- sort order:
- Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
- tag: -1
- value expressions: _col0 (type: bigint)
- auto parallelism: false
- Reducer 3
Needs Tagging: false
Reduce Operator Tree:
Group By Operator
@@ -603,34 +609,35 @@ TOK_QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-2 is a root stage
+ Stage-1 depends on stages: Stage-2
Stage-0 depends on stages: Stage-1
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-2
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 4 (PARTITION-LEVEL SORT, 3)
- Reducer 3 <- Reducer 2 (GROUP, 1)
#### A masked pattern was here ####
Vertices:
- Map 1
+ Map 3
Map Operator Tree:
TableScan
- alias: a
+ alias: b
Statistics: Num rows: 2140 Data size: 8562 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: key is not null (type: boolean)
Statistics: Num rows: 1070 Data size: 4281 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int), part (type: string)
- sort order: ++
- Map-reduce partition columns: key (type: int), part (type: string)
- Statistics: Num rows: 1070 Data size: 4281 Basic stats: COMPLETE Column stats: NONE
- tag: 0
- auto parallelism: false
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ keys:
+ 0 key (type: int), part (type: string)
+ 1 key (type: int), part (type: string)
+ Position of Big Table: 0
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
@@ -643,44 +650,44 @@ STAGE PLANS:
part 1
properties:
COLUMN_STATS_ACCURATE true
- bucket_count 2
+ bucket_count 4
bucket_field_name key
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_1
- numFiles 2
+ name default.srcbucket_mapjoin_part_2
+ numFiles 4
numRows 0
partition_columns part
partition_columns.types string
rawDataSize 0
- serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- totalSize 2750
+ totalSize 5812
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
properties:
- bucket_count 4
+ bucket_count 2
bucket_field_name key
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_1
+ name default.srcbucket_mapjoin_part_2
partition_columns part
partition_columns.types string
- serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.srcbucket_mapjoin_part_1
- name: default.srcbucket_mapjoin_part_1
+ name: default.srcbucket_mapjoin_part_2
+ name: default.srcbucket_mapjoin_part_2
#### A masked pattern was here ####
Partition
base file name: part=2
@@ -690,64 +697,90 @@ STAGE PLANS:
part 2
properties:
COLUMN_STATS_ACCURATE true
- bucket_count 4
+ bucket_count 2
bucket_field_name key
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_1
- numFiles 4
+ name default.srcbucket_mapjoin_part_2
+ numFiles 2
numRows 0
partition_columns part
partition_columns.types string
rawDataSize 0
- serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- totalSize 5812
+ totalSize 2750
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
properties:
- bucket_count 4
+ bucket_count 2
bucket_field_name key
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_1
+ name default.srcbucket_mapjoin_part_2
partition_columns part
partition_columns.types string
- serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.srcbucket_mapjoin_part_1
- name: default.srcbucket_mapjoin_part_1
+ name: default.srcbucket_mapjoin_part_2
+ name: default.srcbucket_mapjoin_part_2
Truncated Path -> Alias:
- /srcbucket_mapjoin_part_1/part=1 [a]
- /srcbucket_mapjoin_part_1/part=2 [a]
- Map 4
+ /srcbucket_mapjoin_part_2/part=1 [b]
+ /srcbucket_mapjoin_part_2/part=2 [b]
+
+ Stage: Stage-1
+ Spark
+ Edges:
+ Reducer 2 <- Map 1 (GROUP, 1)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
Map Operator Tree:
TableScan
- alias: b
+ alias: a
Statistics: Num rows: 2140 Data size: 8562 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: key is not null (type: boolean)
Statistics: Num rows: 1070 Data size: 4281 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int), part (type: string)
- sort order: ++
- Map-reduce partition columns: key (type: int), part (type: string)
- Statistics: Num rows: 1070 Data size: 4281 Basic stats: COMPLETE Column stats: NONE
- tag: 1
- auto parallelism: false
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ keys:
+ 0 key (type: int), part (type: string)
+ 1 key (type: int), part (type: string)
+ input vertices:
+ 1 Map 3
+ Position of Big Table: 0
+ Statistics: Num rows: 1177 Data size: 4709 Basic stats: COMPLETE Column stats: NONE
+ Group By Operator
+ aggregations: count()
+ mode: hash
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ sort order:
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+ tag: -1
+ value expressions: _col0 (type: bigint)
+ auto parallelism: false
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
@@ -760,44 +793,44 @@ STAGE PLANS:
part 1
properties:
COLUMN_STATS_ACCURATE true
- bucket_count 4
+ bucket_count 2
bucket_field_name key
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_2
- numFiles 4
+ name default.srcbucket_mapjoin_part_1
+ numFiles 2
numRows 0
partition_columns part
partition_columns.types string
rawDataSize 0
- serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- totalSize 5812
+ totalSize 2750
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
properties:
- bucket_count 2
+ bucket_count 4
bucket_field_name key
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_2
+ name default.srcbucket_mapjoin_part_1
partition_columns part
partition_columns.types string
- serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.srcbucket_mapjoin_part_2
- name: default.srcbucket_mapjoin_part_2
+ name: default.srcbucket_mapjoin_part_1
+ name: default.srcbucket_mapjoin_part_1
#### A masked pattern was here ####
Partition
base file name: part=2
@@ -807,69 +840,48 @@ STAGE PLANS:
part 2
properties:
COLUMN_STATS_ACCURATE true
- bucket_count 2
+ bucket_count 4
bucket_field_name key
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_2
- numFiles 2
+ name default.srcbucket_mapjoin_part_1
+ numFiles 4
numRows 0
partition_columns part
partition_columns.types string
rawDataSize 0
- serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- totalSize 2750
+ totalSize 5812
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
properties:
- bucket_count 2
+ bucket_count 4
bucket_field_name key
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_2
+ name default.srcbucket_mapjoin_part_1
partition_columns part
partition_columns.types string
- serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.srcbucket_mapjoin_part_2
- name: default.srcbucket_mapjoin_part_2
+ name: default.srcbucket_mapjoin_part_1
+ name: default.srcbucket_mapjoin_part_1
Truncated Path -> Alias:
- /srcbucket_mapjoin_part_2/part=1 [b]
- /srcbucket_mapjoin_part_2/part=2 [b]
+ /srcbucket_mapjoin_part_1/part=1 [a]
+ /srcbucket_mapjoin_part_1/part=2 [a]
Reducer 2
- Needs Tagging: true
- Reduce Operator Tree:
- Join Operator
- condition map:
- Inner Join 0 to 1
- condition expressions:
- 0
- 1
- Statistics: Num rows: 1177 Data size: 4709 Basic stats: COMPLETE Column stats: NONE
- Group By Operator
- aggregations: count()
- mode: hash
- outputColumnNames: _col0
- Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- sort order:
- Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
- tag: -1
- value expressions: _col0 (type: bigint)
- auto parallelism: false
- Reducer 3
Needs Tagging: false
Reduce Operator Tree:
Group By Operator
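
The hunks above all follow one pattern: each query's plan changes from a shuffle join to a map join. The old Stage-1 tagged both inputs with Reduce Output Operators and joined them in a reduce-side Join Operator ("Needs Tagging: true"); the new plan adds a root stage that scans the small side into a Spark HashTable Sink Operator under "Local Work: Map Reduce Local Work", while the dependent stage streams the big side through a Map Join Operator that references the hash-table vertex via "input vertices" and records "Position of Big Table: 0". The .q scripts themselves are not part of this patch; a representative sketch of the query shape that yields these plans, using the table names from the partition metadata above and settings assumed for illustration:

  -- sketch only: the actual setup and data loads live in the bucketmapjoin*.q scripts
  set hive.execution.engine=spark;
  set hive.auto.convert.join=true;
  set hive.optimize.bucketmapjoin=true;

  explain extended
  select /*+ mapjoin(b) */ count(*)
  from srcbucket_mapjoin_part_1 a
  join srcbucket_mapjoin_part_2 b
    on a.key = b.key;

The variant whose keys list "key (type: int), part (type: string)" above corresponds to the same query with an extra join predicate on the partition column.
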
diff --git ql/src/test/results/clientpositive/spark/bucketmapjoin12.q.out ql/src/test/results/clientpositive/spark/bucketmapjoin12.q.out
index aa3e2b6..c0adef4 100644
--- ql/src/test/results/clientpositive/spark/bucketmapjoin12.q.out
+++ ql/src/test/results/clientpositive/spark/bucketmapjoin12.q.out
@@ -161,34 +161,35 @@ TOK_QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-2 is a root stage
+ Stage-1 depends on stages: Stage-2
Stage-0 depends on stages: Stage-1
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-2
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 4 (PARTITION-LEVEL SORT, 1)
- Reducer 3 <- Reducer 2 (GROUP, 1)
#### A masked pattern was here ####
Vertices:
- Map 1
+ Map 3
Map Operator Tree:
TableScan
- alias: a
+ alias: b
Statistics: Num rows: 687 Data size: 2750 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: key is not null (type: boolean)
Statistics: Num rows: 344 Data size: 1377 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int)
- sort order: +
- Map-reduce partition columns: key (type: int)
- Statistics: Num rows: 344 Data size: 1377 Basic stats: COMPLETE Column stats: NONE
- tag: 0
- auto parallelism: false
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ keys:
+ 0 key (type: int)
+ 1 key (type: int)
+ Position of Big Table: 0
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
@@ -207,13 +208,13 @@ STAGE PLANS:
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_1
+ name default.srcbucket_mapjoin_part_2
numFiles 2
numRows 0
partition_columns part
partition_columns.types string
rawDataSize 0
- serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
totalSize 2750
@@ -223,41 +224,66 @@ STAGE PLANS:
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
properties:
- bucket_count 2
- bucket_field_name key
+ bucket_count -1
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_1
+ name default.srcbucket_mapjoin_part_2
partition_columns part
partition_columns.types string
- serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.srcbucket_mapjoin_part_1
- name: default.srcbucket_mapjoin_part_1
+ name: default.srcbucket_mapjoin_part_2
+ name: default.srcbucket_mapjoin_part_2
Truncated Path -> Alias:
- /srcbucket_mapjoin_part_1/part=1 [a]
- Map 4
+ /srcbucket_mapjoin_part_2/part=1 [b]
+
+ Stage: Stage-1
+ Spark
+ Edges:
+ Reducer 2 <- Map 1 (GROUP, 1)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
Map Operator Tree:
TableScan
- alias: b
+ alias: a
Statistics: Num rows: 687 Data size: 2750 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: key is not null (type: boolean)
Statistics: Num rows: 344 Data size: 1377 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int)
- sort order: +
- Map-reduce partition columns: key (type: int)
- Statistics: Num rows: 344 Data size: 1377 Basic stats: COMPLETE Column stats: NONE
- tag: 1
- auto parallelism: false
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ keys:
+ 0 key (type: int)
+ 1 key (type: int)
+ input vertices:
+ 1 Map 3
+ Position of Big Table: 0
+ Statistics: Num rows: 378 Data size: 1514 Basic stats: COMPLETE Column stats: NONE
+ Group By Operator
+ aggregations: count()
+ mode: hash
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ sort order:
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+ tag: -1
+ value expressions: _col0 (type: bigint)
+ auto parallelism: false
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
@@ -276,13 +302,13 @@ STAGE PLANS:
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_2
+ name default.srcbucket_mapjoin_part_1
numFiles 2
numRows 0
partition_columns part
partition_columns.types string
rawDataSize 0
- serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
totalSize 2750
@@ -292,45 +318,25 @@ STAGE PLANS:
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
properties:
- bucket_count -1
+ bucket_count 2
+ bucket_field_name key
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_2
+ name default.srcbucket_mapjoin_part_1
partition_columns part
partition_columns.types string
- serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.srcbucket_mapjoin_part_2
- name: default.srcbucket_mapjoin_part_2
+ name: default.srcbucket_mapjoin_part_1
+ name: default.srcbucket_mapjoin_part_1
Truncated Path -> Alias:
- /srcbucket_mapjoin_part_2/part=1 [b]
+ /srcbucket_mapjoin_part_1/part=1 [a]
Reducer 2
- Needs Tagging: true
- Reduce Operator Tree:
- Join Operator
- condition map:
- Inner Join 0 to 1
- condition expressions:
- 0
- 1
- Statistics: Num rows: 378 Data size: 1514 Basic stats: COMPLETE Column stats: NONE
- Group By Operator
- aggregations: count()
- mode: hash
- outputColumnNames: _col0
- Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- sort order:
- Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
- tag: -1
- value expressions: _col0 (type: bigint)
- auto parallelism: false
- Reducer 3
Needs Tagging: false
Reduce Operator Tree:
Group By Operator
@@ -455,34 +461,35 @@ TOK_QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-2 is a root stage
+ Stage-1 depends on stages: Stage-2
Stage-0 depends on stages: Stage-1
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-2
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 4 (PARTITION-LEVEL SORT, 3)
- Reducer 3 <- Reducer 2 (GROUP, 1)
#### A masked pattern was here ####
Vertices:
- Map 1
+ Map 3
Map Operator Tree:
TableScan
- alias: a
+ alias: b
Statistics: Num rows: 687 Data size: 2750 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: key is not null (type: boolean)
Statistics: Num rows: 344 Data size: 1377 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int)
- sort order: +
- Map-reduce partition columns: key (type: int)
- Statistics: Num rows: 344 Data size: 1377 Basic stats: COMPLETE Column stats: NONE
- tag: 0
- auto parallelism: false
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ keys:
+ 0 key (type: int)
+ 1 key (type: int)
+ Position of Big Table: 0
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
@@ -495,19 +502,18 @@ STAGE PLANS:
part 1
properties:
COLUMN_STATS_ACCURATE true
- bucket_count 2
- bucket_field_name key
+ bucket_count -1
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_1
+ name default.srcbucket_mapjoin_part_3
numFiles 2
numRows 0
partition_columns part
partition_columns.types string
rawDataSize 0
- serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_3 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
totalSize 2750
@@ -523,35 +529,61 @@ STAGE PLANS:
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_1
+ name default.srcbucket_mapjoin_part_3
partition_columns part
partition_columns.types string
- serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_3 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.srcbucket_mapjoin_part_1
- name: default.srcbucket_mapjoin_part_1
+ name: default.srcbucket_mapjoin_part_3
+ name: default.srcbucket_mapjoin_part_3
Truncated Path -> Alias:
- /srcbucket_mapjoin_part_1/part=1 [a]
- Map 4
+ /srcbucket_mapjoin_part_3/part=1 [b]
+
+ Stage: Stage-1
+ Spark
+ Edges:
+ Reducer 2 <- Map 1 (GROUP, 1)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
Map Operator Tree:
TableScan
- alias: b
+ alias: a
Statistics: Num rows: 687 Data size: 2750 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: key is not null (type: boolean)
Statistics: Num rows: 344 Data size: 1377 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int)
- sort order: +
- Map-reduce partition columns: key (type: int)
- Statistics: Num rows: 344 Data size: 1377 Basic stats: COMPLETE Column stats: NONE
- tag: 1
- auto parallelism: false
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ keys:
+ 0 key (type: int)
+ 1 key (type: int)
+ input vertices:
+ 1 Map 3
+ Position of Big Table: 0
+ Statistics: Num rows: 378 Data size: 1514 Basic stats: COMPLETE Column stats: NONE
+ Group By Operator
+ aggregations: count()
+ mode: hash
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ sort order:
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+ tag: -1
+ value expressions: _col0 (type: bigint)
+ auto parallelism: false
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
@@ -564,18 +596,19 @@ STAGE PLANS:
part 1
properties:
COLUMN_STATS_ACCURATE true
- bucket_count -1
+ bucket_count 2
+ bucket_field_name key
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_3
+ name default.srcbucket_mapjoin_part_1
numFiles 2
numRows 0
partition_columns part
partition_columns.types string
rawDataSize 0
- serialization.ddl struct srcbucket_mapjoin_part_3 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
totalSize 2750
@@ -591,40 +624,19 @@ STAGE PLANS:
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_3
+ name default.srcbucket_mapjoin_part_1
partition_columns part
partition_columns.types string
- serialization.ddl struct srcbucket_mapjoin_part_3 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.srcbucket_mapjoin_part_3
- name: default.srcbucket_mapjoin_part_3
+ name: default.srcbucket_mapjoin_part_1
+ name: default.srcbucket_mapjoin_part_1
Truncated Path -> Alias:
- /srcbucket_mapjoin_part_3/part=1 [b]
+ /srcbucket_mapjoin_part_1/part=1 [a]
Reducer 2
- Needs Tagging: true
- Reduce Operator Tree:
- Join Operator
- condition map:
- Inner Join 0 to 1
- condition expressions:
- 0
- 1
- Statistics: Num rows: 378 Data size: 1514 Basic stats: COMPLETE Column stats: NONE
- Group By Operator
- aggregations: count()
- mode: hash
- outputColumnNames: _col0
- Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- sort order:
- Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
- tag: -1
- value expressions: _col0 (type: bigint)
- auto parallelism: false
- Reducer 3
Needs Tagging: false
Reduce Operator Tree:
Group By Operator
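
bucketmapjoin12.q.out shows the same rewrite with single-partition inputs (only part=1 appears under Truncated Path -> Alias) and with deliberately mixed bucketing metadata: srcbucket_mapjoin_part_1 reports bucket_count 2 on key, while srcbucket_mapjoin_part_2's table-level properties and srcbucket_mapjoin_part_3's partition properties report bucket_count -1. The DDL producing that mismatch is in the test setup, not in this patch; a hedged sketch of one way such divergent metadata arises:

  -- sketch only: a partition keeps its bucketing metadata after the table spec changes
  create table srcbucket_mapjoin_part_2 (key int, value string)
    partitioned by (part string)
    clustered by (key) into 2 buckets
    stored as textfile;
  -- ... load partition part='1' while the table is still declared bucketed ...
  -- then drop the table-level bucketing, leaving the partition metadata intact:
  alter table srcbucket_mapjoin_part_2 not clustered;
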
diff --git ql/src/test/results/clientpositive/spark/bucketmapjoin13.q.out ql/src/test/results/clientpositive/spark/bucketmapjoin13.q.out
index 44233f6..98d0706 100644
--- ql/src/test/results/clientpositive/spark/bucketmapjoin13.q.out
+++ ql/src/test/results/clientpositive/spark/bucketmapjoin13.q.out
@@ -120,34 +120,35 @@ TOK_QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-2 is a root stage
+ Stage-1 depends on stages: Stage-2
Stage-0 depends on stages: Stage-1
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-2
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 4 (PARTITION-LEVEL SORT, 3)
- Reducer 3 <- Reducer 2 (GROUP, 1)
#### A masked pattern was here ####
Vertices:
- Map 1
+ Map 3
Map Operator Tree:
TableScan
- alias: a
- Statistics: Num rows: 1000 Data size: 10624 Basic stats: COMPLETE Column stats: NONE
+ alias: b
+ Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: key is not null (type: boolean)
- Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int)
- sort order: +
- Map-reduce partition columns: key (type: int)
- Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
- tag: 0
- auto parallelism: false
+ Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ keys:
+ 0 key (type: int)
+ 1 key (type: int)
+ Position of Big Table: 0
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
@@ -161,18 +162,18 @@ STAGE PLANS:
properties:
COLUMN_STATS_ACCURATE true
bucket_count 2
- bucket_field_name value
+ bucket_field_name key
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_1
+ name default.srcbucket_mapjoin_part_2
numFiles 2
numRows 500
partition_columns part
partition_columns.types string
rawDataSize 5312
- serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
totalSize 5812
@@ -188,27 +189,75 @@ STAGE PLANS:
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_1
+ name default.srcbucket_mapjoin_part_2
partition_columns part
partition_columns.types string
- serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.srcbucket_mapjoin_part_1
- name: default.srcbucket_mapjoin_part_1
+ name: default.srcbucket_mapjoin_part_2
+ name: default.srcbucket_mapjoin_part_2
+ Truncated Path -> Alias:
+ /srcbucket_mapjoin_part_2/part=1 [b]
+
+ Stage: Stage-1
+ Spark
+ Edges:
+ Reducer 2 <- Map 1 (GROUP, 1)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: a
+ Statistics: Num rows: 1000 Data size: 10624 Basic stats: COMPLETE Column stats: NONE
+ GatherStats: false
+ Filter Operator
+ isSamplingPred: false
+ predicate: key is not null (type: boolean)
+ Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ keys:
+ 0 key (type: int)
+ 1 key (type: int)
+ input vertices:
+ 1 Map 3
+ Position of Big Table: 0
+ Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
+ Group By Operator
+ aggregations: count()
+ mode: hash
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ sort order:
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+ tag: -1
+ value expressions: _col0 (type: bigint)
+ auto parallelism: false
+ Local Work:
+ Map Reduce Local Work
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
#### A masked pattern was here ####
Partition
- base file name: part=2
+ base file name: part=1
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
partition values:
- part 2
+ part 1
properties:
COLUMN_STATS_ACCURATE true
bucket_count 2
- bucket_field_name key
+ bucket_field_name value
columns key,value
columns.comments
columns.types int:string
@@ -245,36 +294,13 @@ STAGE PLANS:
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
name: default.srcbucket_mapjoin_part_1
name: default.srcbucket_mapjoin_part_1
- Truncated Path -> Alias:
- /srcbucket_mapjoin_part_1/part=1 [a]
- /srcbucket_mapjoin_part_1/part=2 [a]
- Map 4
- Map Operator Tree:
- TableScan
- alias: b
- Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
- GatherStats: false
- Filter Operator
- isSamplingPred: false
- predicate: key is not null (type: boolean)
- Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int)
- sort order: +
- Map-reduce partition columns: key (type: int)
- Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
- tag: 1
- auto parallelism: false
- Path -> Alias:
-#### A masked pattern was here ####
- Path -> Partition:
#### A masked pattern was here ####
Partition
- base file name: part=1
+ base file name: part=2
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
partition values:
- part 1
+ part 2
properties:
COLUMN_STATS_ACCURATE true
bucket_count 2
@@ -283,13 +309,13 @@ STAGE PLANS:
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_2
+ name default.srcbucket_mapjoin_part_1
numFiles 2
numRows 500
partition_columns part
partition_columns.types string
rawDataSize 5312
- serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
totalSize 5812
@@ -305,40 +331,20 @@ STAGE PLANS:
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_2
+ name default.srcbucket_mapjoin_part_1
partition_columns part
partition_columns.types string
- serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.srcbucket_mapjoin_part_2
- name: default.srcbucket_mapjoin_part_2
+ name: default.srcbucket_mapjoin_part_1
+ name: default.srcbucket_mapjoin_part_1
Truncated Path -> Alias:
- /srcbucket_mapjoin_part_2/part=1 [b]
+ /srcbucket_mapjoin_part_1/part=1 [a]
+ /srcbucket_mapjoin_part_1/part=2 [a]
Reducer 2
- Needs Tagging: true
- Reduce Operator Tree:
- Join Operator
- condition map:
- Inner Join 0 to 1
- condition expressions:
- 0
- 1
- Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
- Group By Operator
- aggregations: count()
- mode: hash
- outputColumnNames: _col0
- Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- sort order:
- Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
- tag: -1
- value expressions: _col0 (type: bigint)
- auto parallelism: false
- Reducer 3
Needs Tagging: false
Reduce Operator Tree:
Group By Operator
@@ -458,44 +464,45 @@ TOK_QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-2 is a root stage
+ Stage-1 depends on stages: Stage-2
Stage-0 depends on stages: Stage-1
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-2
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 4 (PARTITION-LEVEL SORT, 3)
- Reducer 3 <- Reducer 2 (GROUP, 1)
#### A masked pattern was here ####
Vertices:
- Map 1
+ Map 3
Map Operator Tree:
TableScan
- alias: a
+ alias: b
Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: key is not null (type: boolean)
Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int)
- sort order: +
- Map-reduce partition columns: key (type: int)
- Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
- tag: 0
- auto parallelism: false
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ keys:
+ 0 key (type: int)
+ 1 key (type: int)
+ Position of Big Table: 0
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
#### A masked pattern was here ####
Partition
- base file name: part=2
+ base file name: part=1
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
partition values:
- part 2
+ part 1
properties:
COLUMN_STATS_ACCURATE true
bucket_count 2
@@ -504,13 +511,13 @@ STAGE PLANS:
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_1
+ name default.srcbucket_mapjoin_part_2
numFiles 2
numRows 500
partition_columns part
partition_columns.types string
rawDataSize 5312
- serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
totalSize 5812
@@ -526,45 +533,71 @@ STAGE PLANS:
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_1
+ name default.srcbucket_mapjoin_part_2
partition_columns part
partition_columns.types string
- serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.srcbucket_mapjoin_part_1
- name: default.srcbucket_mapjoin_part_1
+ name: default.srcbucket_mapjoin_part_2
+ name: default.srcbucket_mapjoin_part_2
Truncated Path -> Alias:
- /srcbucket_mapjoin_part_1/part=2 [a]
- Map 4
+ /srcbucket_mapjoin_part_2/part=1 [b]
+
+ Stage: Stage-1
+ Spark
+ Edges:
+ Reducer 2 <- Map 1 (GROUP, 1)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
Map Operator Tree:
TableScan
- alias: b
+ alias: a
Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: key is not null (type: boolean)
Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int)
- sort order: +
- Map-reduce partition columns: key (type: int)
- Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
- tag: 1
- auto parallelism: false
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ keys:
+ 0 key (type: int)
+ 1 key (type: int)
+ input vertices:
+ 1 Map 3
+ Position of Big Table: 0
+ Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE
+ Group By Operator
+ aggregations: count()
+ mode: hash
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ sort order:
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+ tag: -1
+ value expressions: _col0 (type: bigint)
+ auto parallelism: false
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
#### A masked pattern was here ####
Partition
- base file name: part=1
+ base file name: part=2
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
partition values:
- part 1
+ part 2
properties:
COLUMN_STATS_ACCURATE true
bucket_count 2
@@ -573,13 +606,13 @@ STAGE PLANS:
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_2
+ name default.srcbucket_mapjoin_part_1
numFiles 2
numRows 500
partition_columns part
partition_columns.types string
rawDataSize 5312
- serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
totalSize 5812
@@ -595,40 +628,19 @@ STAGE PLANS:
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_2
+ name default.srcbucket_mapjoin_part_1
partition_columns part
partition_columns.types string
- serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.srcbucket_mapjoin_part_2
- name: default.srcbucket_mapjoin_part_2
+ name: default.srcbucket_mapjoin_part_1
+ name: default.srcbucket_mapjoin_part_1
Truncated Path -> Alias:
- /srcbucket_mapjoin_part_2/part=1 [b]
+ /srcbucket_mapjoin_part_1/part=2 [a]
Reducer 2
- Needs Tagging: true
- Reduce Operator Tree:
- Join Operator
- condition map:
- Inner Join 0 to 1
- condition expressions:
- 0
- 1
- Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE
- Group By Operator
- aggregations: count()
- mode: hash
- outputColumnNames: _col0
- Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- sort order:
- Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
- tag: -1
- value expressions: _col0 (type: bigint)
- auto parallelism: false
- Reducer 3
Needs Tagging: false
Reduce Operator Tree:
Group By Operator
@@ -747,44 +759,45 @@ TOK_QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-2 is a root stage
+ Stage-1 depends on stages: Stage-2
Stage-0 depends on stages: Stage-1
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-2
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 4 (PARTITION-LEVEL SORT, 3)
- Reducer 3 <- Reducer 2 (GROUP, 1)
#### A masked pattern was here ####
Vertices:
- Map 1
+ Map 3
Map Operator Tree:
TableScan
- alias: a
+ alias: b
Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: key is not null (type: boolean)
Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int)
- sort order: +
- Map-reduce partition columns: key (type: int)
- Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
- tag: 0
- auto parallelism: false
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ keys:
+ 0 key (type: int)
+ 1 key (type: int)
+ Position of Big Table: 0
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
#### A masked pattern was here ####
Partition
- base file name: part=2
+ base file name: part=1
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
partition values:
- part 2
+ part 1
properties:
COLUMN_STATS_ACCURATE true
bucket_count 2
@@ -793,13 +806,13 @@ STAGE PLANS:
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_1
+ name default.srcbucket_mapjoin_part_2
numFiles 2
numRows 500
partition_columns part
partition_columns.types string
rawDataSize 5312
- serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
totalSize 5812
@@ -815,45 +828,71 @@ STAGE PLANS:
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_1
+ name default.srcbucket_mapjoin_part_2
partition_columns part
partition_columns.types string
- serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.srcbucket_mapjoin_part_1
- name: default.srcbucket_mapjoin_part_1
+ name: default.srcbucket_mapjoin_part_2
+ name: default.srcbucket_mapjoin_part_2
Truncated Path -> Alias:
- /srcbucket_mapjoin_part_1/part=2 [a]
- Map 4
+ /srcbucket_mapjoin_part_2/part=1 [b]
+
+ Stage: Stage-1
+ Spark
+ Edges:
+ Reducer 2 <- Map 1 (GROUP, 1)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
Map Operator Tree:
TableScan
- alias: b
+ alias: a
Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: key is not null (type: boolean)
Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int)
- sort order: +
- Map-reduce partition columns: key (type: int)
- Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
- tag: 1
- auto parallelism: false
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ keys:
+ 0 key (type: int)
+ 1 key (type: int)
+ input vertices:
+ 1 Map 3
+ Position of Big Table: 0
+ Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE
+ Group By Operator
+ aggregations: count()
+ mode: hash
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ sort order:
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+ tag: -1
+ value expressions: _col0 (type: bigint)
+ auto parallelism: false
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
#### A masked pattern was here ####
Partition
- base file name: part=1
+ base file name: part=2
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
partition values:
- part 1
+ part 2
properties:
COLUMN_STATS_ACCURATE true
bucket_count 2
@@ -862,13 +901,13 @@ STAGE PLANS:
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_2
+ name default.srcbucket_mapjoin_part_1
numFiles 2
numRows 500
partition_columns part
partition_columns.types string
rawDataSize 5312
- serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
totalSize 5812
@@ -884,40 +923,19 @@ STAGE PLANS:
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_2
+ name default.srcbucket_mapjoin_part_1
partition_columns part
partition_columns.types string
- serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.srcbucket_mapjoin_part_2
- name: default.srcbucket_mapjoin_part_2
+ name: default.srcbucket_mapjoin_part_1
+ name: default.srcbucket_mapjoin_part_1
Truncated Path -> Alias:
- /srcbucket_mapjoin_part_2/part=1 [b]
+ /srcbucket_mapjoin_part_1/part=2 [a]
Reducer 2
- Needs Tagging: true
- Reduce Operator Tree:
- Join Operator
- condition map:
- Inner Join 0 to 1
- condition expressions:
- 0
- 1
- Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE
- Group By Operator
- aggregations: count()
- mode: hash
- outputColumnNames: _col0
- Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- sort order:
- Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
- tag: -1
- value expressions: _col0 (type: bigint)
- auto parallelism: false
- Reducer 3
Needs Tagging: false
Reduce Operator Tree:
Group By Operator
@@ -1038,44 +1056,45 @@ TOK_QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-2 is a root stage
+ Stage-1 depends on stages: Stage-2
Stage-0 depends on stages: Stage-1
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-2
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 4 (PARTITION-LEVEL SORT, 3)
- Reducer 3 <- Reducer 2 (GROUP, 1)
#### A masked pattern was here ####
Vertices:
- Map 1
+ Map 3
Map Operator Tree:
TableScan
- alias: a
+ alias: b
Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: key is not null (type: boolean)
Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int)
- sort order: +
- Map-reduce partition columns: key (type: int)
- Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
- tag: 0
- auto parallelism: false
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ keys:
+ 0 key (type: int)
+ 1 key (type: int)
+ Position of Big Table: 0
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
#### A masked pattern was here ####
Partition
- base file name: part=2
+ base file name: part=1
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
partition values:
- part 2
+ part 1
properties:
COLUMN_STATS_ACCURATE true
bucket_count 2
@@ -1084,13 +1103,13 @@ STAGE PLANS:
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_1
+ name default.srcbucket_mapjoin_part_2
numFiles 2
numRows 500
partition_columns part
partition_columns.types string
rawDataSize 5312
- serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
totalSize 5812
@@ -1101,50 +1120,76 @@ STAGE PLANS:
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
properties:
bucket_count 2
- bucket_field_name value
+ bucket_field_name key
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_1
+ name default.srcbucket_mapjoin_part_2
partition_columns part
partition_columns.types string
- serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.srcbucket_mapjoin_part_1
- name: default.srcbucket_mapjoin_part_1
+ name: default.srcbucket_mapjoin_part_2
+ name: default.srcbucket_mapjoin_part_2
Truncated Path -> Alias:
- /srcbucket_mapjoin_part_1/part=2 [a]
- Map 4
+ /srcbucket_mapjoin_part_2/part=1 [b]
+
+ Stage: Stage-1
+ Spark
+ Edges:
+ Reducer 2 <- Map 1 (GROUP, 1)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
Map Operator Tree:
TableScan
- alias: b
+ alias: a
Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: key is not null (type: boolean)
Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int)
- sort order: +
- Map-reduce partition columns: key (type: int)
- Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
- tag: 1
- auto parallelism: false
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ keys:
+ 0 key (type: int)
+ 1 key (type: int)
+ input vertices:
+ 1 Map 3
+ Position of Big Table: 0
+ Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE
+ Group By Operator
+ aggregations: count()
+ mode: hash
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ sort order:
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+ tag: -1
+ value expressions: _col0 (type: bigint)
+ auto parallelism: false
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
#### A masked pattern was here ####
Partition
- base file name: part=1
+ base file name: part=2
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
partition values:
- part 1
+ part 2
properties:
COLUMN_STATS_ACCURATE true
bucket_count 2
@@ -1153,13 +1198,13 @@ STAGE PLANS:
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_2
+ name default.srcbucket_mapjoin_part_1
numFiles 2
numRows 500
partition_columns part
partition_columns.types string
rawDataSize 5312
- serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
totalSize 5812
@@ -1170,45 +1215,24 @@ STAGE PLANS:
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
properties:
bucket_count 2
- bucket_field_name key
+ bucket_field_name value
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_2
+ name default.srcbucket_mapjoin_part_1
partition_columns part
partition_columns.types string
- serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.srcbucket_mapjoin_part_2
- name: default.srcbucket_mapjoin_part_2
+ name: default.srcbucket_mapjoin_part_1
+ name: default.srcbucket_mapjoin_part_1
Truncated Path -> Alias:
- /srcbucket_mapjoin_part_2/part=1 [b]
+ /srcbucket_mapjoin_part_1/part=2 [a]
Reducer 2
- Needs Tagging: true
- Reduce Operator Tree:
- Join Operator
- condition map:
- Inner Join 0 to 1
- condition expressions:
- 0
- 1
- Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE
- Group By Operator
- aggregations: count()
- mode: hash
- outputColumnNames: _col0
- Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- sort order:
- Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
- tag: -1
- value expressions: _col0 (type: bigint)
- auto parallelism: false
- Reducer 3
Needs Tagging: false
Reduce Operator Tree:
Group By Operator
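
In bucketmapjoin13.q.out the notable detail is the bucket_field_name flip on srcbucket_mapjoin_part_1: the metadata above shows partition part=1 bucketed on value and part=2 bucketed on key, so bucketing disagrees across partitions of the same table, yet every query still converts to the hash-table-sink/map-join pattern. The exact steps are in the test script, not this patch; a hedged sketch of how that state can be reached (src is assumed to be the standard 500-row test table, matching the numRows 500 stats above):

  -- sketch only: rebucket the table on a different column between partition loads
  create table srcbucket_mapjoin_part_1 (key int, value string)
    partitioned by (part string)
    clustered by (value) into 2 buckets;
  insert overwrite table srcbucket_mapjoin_part_1 partition (part = '1')
    select key, value from src;
  alter table srcbucket_mapjoin_part_1 clustered by (key) into 2 buckets;
  insert overwrite table srcbucket_mapjoin_part_1 partition (part = '2')
    select key, value from src;
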
diff --git ql/src/test/results/clientpositive/spark/bucketmapjoin2.q.out ql/src/test/results/clientpositive/spark/bucketmapjoin2.q.out
index c4702ef..ea763c7 100644
--- ql/src/test/results/clientpositive/spark/bucketmapjoin2.q.out
+++ ql/src/test/results/clientpositive/spark/bucketmapjoin2.q.out
@@ -158,35 +158,36 @@ TOK_QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-3 is a root stage
+ Stage-1 depends on stages: Stage-3
Stage-0 depends on stages: Stage-1
Stage-2 depends on stages: Stage-0
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-3
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 3 (PARTITION-LEVEL SORT, 1)
#### A masked pattern was here ####
Vertices:
- Map 1
+ Map 2
Map Operator Tree:
TableScan
- alias: a
- Statistics: Num rows: 55 Data size: 5812 Basic stats: COMPLETE Column stats: NONE
+ alias: b
+ Statistics: Num rows: 29 Data size: 3062 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: key is not null (type: boolean)
- Statistics: Num rows: 28 Data size: 2958 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int)
- sort order: +
- Map-reduce partition columns: key (type: int)
- Statistics: Num rows: 28 Data size: 2958 Basic stats: COMPLETE Column stats: NONE
- tag: 0
- value expressions: value (type: string)
- auto parallelism: false
+ Statistics: Num rows: 15 Data size: 1583 Basic stats: COMPLETE Column stats: NONE
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0 {key} {value}
+ 1 {value}
+ keys:
+ 0 key (type: int)
+ 1 key (type: int)
+ Position of Big Table: 0
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
@@ -199,64 +200,107 @@ STAGE PLANS:
ds 2008-04-08
properties:
COLUMN_STATS_ACCURATE true
- bucket_count 4
+ bucket_count 2
bucket_field_name key
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part
- numFiles 4
+ name default.srcbucket_mapjoin_part_2
+ numFiles 2
numRows 0
partition_columns ds
partition_columns.types string
rawDataSize 0
- serialization.ddl struct srcbucket_mapjoin_part { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- totalSize 5812
+ totalSize 3062
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
properties:
- bucket_count 4
+ bucket_count 2
bucket_field_name key
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part
+ name default.srcbucket_mapjoin_part_2
partition_columns ds
partition_columns.types string
- serialization.ddl struct srcbucket_mapjoin_part { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.srcbucket_mapjoin_part
- name: default.srcbucket_mapjoin_part
+ name: default.srcbucket_mapjoin_part_2
+ name: default.srcbucket_mapjoin_part_2
Truncated Path -> Alias:
- /srcbucket_mapjoin_part/ds=2008-04-08 [a]
- Map 3
+ /srcbucket_mapjoin_part_2/ds=2008-04-08 [b]
+
+ Stage: Stage-1
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
Map Operator Tree:
TableScan
- alias: b
- Statistics: Num rows: 29 Data size: 3062 Basic stats: COMPLETE Column stats: NONE
+ alias: a
+ Statistics: Num rows: 55 Data size: 5812 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: key is not null (type: boolean)
- Statistics: Num rows: 15 Data size: 1583 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int)
- sort order: +
- Map-reduce partition columns: key (type: int)
- Statistics: Num rows: 15 Data size: 1583 Basic stats: COMPLETE Column stats: NONE
- tag: 1
- value expressions: value (type: string)
- auto parallelism: false
+ Statistics: Num rows: 28 Data size: 2958 Basic stats: COMPLETE Column stats: NONE
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {key} {value}
+ 1 {value}
+ keys:
+ 0 key (type: int)
+ 1 key (type: int)
+ outputColumnNames: _col0, _col1, _col7
+ input vertices:
+ 1 Map 2
+ Position of Big Table: 0
+ Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: _col0 (type: int), _col1 (type: string), _col7 (type: string)
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ GlobalTableId: 1
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ bucket_count -1
+ columns key,value1,value2
+ columns.comments
+ columns.types string:string:string
+#### A masked pattern was here ####
+ name default.bucketmapjoin_tmp_result
+ serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucketmapjoin_tmp_result
+ TotalFiles: 1
+ GatherStats: true
+ MultiFileSpray: false
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
@@ -269,87 +313,46 @@ STAGE PLANS:
ds 2008-04-08
properties:
COLUMN_STATS_ACCURATE true
- bucket_count 2
+ bucket_count 4
bucket_field_name key
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_2
- numFiles 2
+ name default.srcbucket_mapjoin_part
+ numFiles 4
numRows 0
partition_columns ds
partition_columns.types string
rawDataSize 0
- serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- totalSize 3062
+ totalSize 5812
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
properties:
- bucket_count 2
+ bucket_count 4
bucket_field_name key
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_2
+ name default.srcbucket_mapjoin_part
partition_columns ds
partition_columns.types string
- serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.srcbucket_mapjoin_part_2
- name: default.srcbucket_mapjoin_part_2
+ name: default.srcbucket_mapjoin_part
+ name: default.srcbucket_mapjoin_part
Truncated Path -> Alias:
- /srcbucket_mapjoin_part_2/ds=2008-04-08 [b]
- Reducer 2
- Needs Tagging: true
- Reduce Operator Tree:
- Join Operator
- condition map:
- Inner Join 0 to 1
- condition expressions:
- 0 {KEY.reducesinkkey0} {VALUE._col0}
- 1 {VALUE._col0}
- outputColumnNames: _col0, _col1, _col7
- Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE
- Select Operator
- expressions: _col0 (type: int), _col1 (type: string), _col7 (type: string)
- outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE
- File Output Operator
- compressed: false
- GlobalTableId: 1
-#### A masked pattern was here ####
- NumFilesPerFileSink: 1
- Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE
-#### A masked pattern was here ####
- table:
- input format: org.apache.hadoop.mapred.TextInputFormat
- output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
- properties:
- bucket_count -1
- columns key,value1,value2
- columns.comments
- columns.types string:string:string
-#### A masked pattern was here ####
- name default.bucketmapjoin_tmp_result
- serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2}
- serialization.format 1
- serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
-#### A masked pattern was here ####
- serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.bucketmapjoin_tmp_result
- TotalFiles: 1
- GatherStats: true
- MultiFileSpray: false
+ /srcbucket_mapjoin_part/ds=2008-04-08 [a]
Stage: Stage-0
Move Operator
@@ -552,15 +555,14 @@ TOK_QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-3 is a root stage
+ Stage-1 depends on stages: Stage-3
Stage-0 depends on stages: Stage-1
Stage-2 depends on stages: Stage-0
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-3
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 3 (PARTITION-LEVEL SORT, 3)
#### A masked pattern was here ####
Vertices:
Map 1
@@ -573,14 +575,16 @@ STAGE PLANS:
isSamplingPred: false
predicate: key is not null (type: boolean)
Statistics: Num rows: 28 Data size: 2958 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int)
- sort order: +
- Map-reduce partition columns: key (type: int)
- Statistics: Num rows: 28 Data size: 2958 Basic stats: COMPLETE Column stats: NONE
- tag: 0
- value expressions: value (type: string)
- auto parallelism: false
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0 {value}
+ 1 {value}
+ keys:
+ 0 key (type: int)
+ 1 key (type: int)
+ Position of Big Table: 1
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
@@ -633,7 +637,12 @@ STAGE PLANS:
name: default.srcbucket_mapjoin_part
Truncated Path -> Alias:
/srcbucket_mapjoin_part/ds=2008-04-08 [a]
- Map 3
+
+ Stage: Stage-1
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 2
Map Operator Tree:
TableScan
alias: b
@@ -643,14 +652,57 @@ STAGE PLANS:
isSamplingPred: false
predicate: key is not null (type: boolean)
Statistics: Num rows: 15 Data size: 1583 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int)
- sort order: +
- Map-reduce partition columns: key (type: int)
- Statistics: Num rows: 15 Data size: 1583 Basic stats: COMPLETE Column stats: NONE
- tag: 1
- value expressions: value (type: string)
- auto parallelism: false
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {key} {value}
+ 1 {value}
+ keys:
+ 0 key (type: int)
+ 1 key (type: int)
+ outputColumnNames: _col0, _col1, _col7
+ input vertices:
+ 0 Map 1
+ Position of Big Table: 1
+ Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: _col0 (type: int), _col1 (type: string), _col7 (type: string)
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ GlobalTableId: 1
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ COLUMN_STATS_ACCURATE true
+ bucket_count -1
+ columns key,value1,value2
+ columns.comments
+ columns.types string:string:string
+#### A masked pattern was here ####
+ name default.bucketmapjoin_tmp_result
+ numFiles 1
+ numRows 564
+ rawDataSize 10503
+ serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 11067
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucketmapjoin_tmp_result
+ TotalFiles: 1
+ GatherStats: true
+ MultiFileSpray: false
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
@@ -703,52 +755,6 @@ STAGE PLANS:
name: default.srcbucket_mapjoin_part_2
Truncated Path -> Alias:
/srcbucket_mapjoin_part_2/ds=2008-04-08 [b]
- Reducer 2
- Needs Tagging: true
- Reduce Operator Tree:
- Join Operator
- condition map:
- Inner Join 0 to 1
- condition expressions:
- 0 {KEY.reducesinkkey0} {VALUE._col0}
- 1 {VALUE._col0}
- outputColumnNames: _col0, _col1, _col7
- Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE
- Select Operator
- expressions: _col0 (type: int), _col1 (type: string), _col7 (type: string)
- outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE
- File Output Operator
- compressed: false
- GlobalTableId: 1
-#### A masked pattern was here ####
- NumFilesPerFileSink: 1
- Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE
-#### A masked pattern was here ####
- table:
- input format: org.apache.hadoop.mapred.TextInputFormat
- output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
- properties:
- COLUMN_STATS_ACCURATE true
- bucket_count -1
- columns key,value1,value2
- columns.comments
- columns.types string:string:string
-#### A masked pattern was here ####
- name default.bucketmapjoin_tmp_result
- numFiles 3
- numRows 564
- rawDataSize 10503
- serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2}
- serialization.format 1
- serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- totalSize 11067
-#### A masked pattern was here ####
- serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.bucketmapjoin_tmp_result
- TotalFiles: 1
- GatherStats: true
- MultiFileSpray: false
Stage: Stage-0
Move Operator
@@ -766,7 +772,7 @@ STAGE PLANS:
columns.types string:string:string
#### A masked pattern was here ####
name default.bucketmapjoin_tmp_result
- numFiles 3
+ numFiles 1
numRows 564
rawDataSize 10503
serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2}
@@ -968,35 +974,36 @@ TOK_QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-3 is a root stage
+ Stage-1 depends on stages: Stage-3
Stage-0 depends on stages: Stage-1
Stage-2 depends on stages: Stage-0
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-3
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 3 (PARTITION-LEVEL SORT, 3)
#### A masked pattern was here ####
Vertices:
- Map 1
+ Map 2
Map Operator Tree:
TableScan
- alias: a
- Statistics: Num rows: 55 Data size: 5812 Basic stats: COMPLETE Column stats: NONE
+ alias: b
+ Statistics: Num rows: 58 Data size: 6124 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: key is not null (type: boolean)
- Statistics: Num rows: 28 Data size: 2958 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int)
- sort order: +
- Map-reduce partition columns: key (type: int)
- Statistics: Num rows: 28 Data size: 2958 Basic stats: COMPLETE Column stats: NONE
- tag: 0
- value expressions: value (type: string)
- auto parallelism: false
+ Statistics: Num rows: 29 Data size: 3062 Basic stats: COMPLETE Column stats: NONE
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0 {key} {value}
+ 1 {value}
+ keys:
+ 0 key (type: int)
+ 1 key (type: int)
+ Position of Big Table: 0
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
@@ -1009,74 +1016,51 @@ STAGE PLANS:
ds 2008-04-08
properties:
COLUMN_STATS_ACCURATE true
- bucket_count 4
+ bucket_count 2
bucket_field_name key
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part
- numFiles 4
+ name default.srcbucket_mapjoin_part_2
+ numFiles 2
numRows 0
partition_columns ds
partition_columns.types string
rawDataSize 0
- serialization.ddl struct srcbucket_mapjoin_part { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- totalSize 5812
+ totalSize 3062
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
properties:
- bucket_count 4
+ bucket_count 2
bucket_field_name key
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part
+ name default.srcbucket_mapjoin_part_2
partition_columns ds
partition_columns.types string
- serialization.ddl struct srcbucket_mapjoin_part { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.srcbucket_mapjoin_part
- name: default.srcbucket_mapjoin_part
- Truncated Path -> Alias:
- /srcbucket_mapjoin_part/ds=2008-04-08 [a]
- Map 3
- Map Operator Tree:
- TableScan
- alias: b
- Statistics: Num rows: 58 Data size: 6124 Basic stats: COMPLETE Column stats: NONE
- GatherStats: false
- Filter Operator
- isSamplingPred: false
- predicate: key is not null (type: boolean)
- Statistics: Num rows: 29 Data size: 3062 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int)
- sort order: +
- Map-reduce partition columns: key (type: int)
- Statistics: Num rows: 29 Data size: 3062 Basic stats: COMPLETE Column stats: NONE
- tag: 1
- value expressions: value (type: string)
- auto parallelism: false
- Path -> Alias:
-#### A masked pattern was here ####
- Path -> Partition:
+ name: default.srcbucket_mapjoin_part_2
+ name: default.srcbucket_mapjoin_part_2
#### A masked pattern was here ####
Partition
- base file name: ds=2008-04-08
+ base file name: ds=2008-04-09
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
partition values:
- ds 2008-04-08
+ ds 2008-04-09
properties:
COLUMN_STATS_ACCURATE true
bucket_count 2
@@ -1117,102 +1101,127 @@ STAGE PLANS:
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
name: default.srcbucket_mapjoin_part_2
name: default.srcbucket_mapjoin_part_2
+ Truncated Path -> Alias:
+ /srcbucket_mapjoin_part_2/ds=2008-04-08 [b]
+ /srcbucket_mapjoin_part_2/ds=2008-04-09 [b]
+
+ Stage: Stage-1
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: a
+ Statistics: Num rows: 55 Data size: 5812 Basic stats: COMPLETE Column stats: NONE
+ GatherStats: false
+ Filter Operator
+ isSamplingPred: false
+ predicate: key is not null (type: boolean)
+ Statistics: Num rows: 28 Data size: 2958 Basic stats: COMPLETE Column stats: NONE
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {key} {value}
+ 1 {value}
+ keys:
+ 0 key (type: int)
+ 1 key (type: int)
+ outputColumnNames: _col0, _col1, _col7
+ input vertices:
+ 1 Map 2
+ Position of Big Table: 0
+ Statistics: Num rows: 31 Data size: 3368 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: _col0 (type: int), _col1 (type: string), _col7 (type: string)
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 31 Data size: 3368 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ GlobalTableId: 1
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ Statistics: Num rows: 31 Data size: 3368 Basic stats: COMPLETE Column stats: NONE
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ COLUMN_STATS_ACCURATE true
+ bucket_count -1
+ columns key,value1,value2
+ columns.comments
+ columns.types string:string:string
+#### A masked pattern was here ####
+ name default.bucketmapjoin_tmp_result
+ numFiles 1
+ numRows 564
+ rawDataSize 10503
+ serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 11067
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucketmapjoin_tmp_result
+ TotalFiles: 1
+ GatherStats: true
+ MultiFileSpray: false
+ Local Work:
+ Map Reduce Local Work
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
#### A masked pattern was here ####
Partition
- base file name: ds=2008-04-09
+ base file name: ds=2008-04-08
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
partition values:
- ds 2008-04-09
+ ds 2008-04-08
properties:
COLUMN_STATS_ACCURATE true
- bucket_count 2
+ bucket_count 4
bucket_field_name key
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_2
- numFiles 2
+ name default.srcbucket_mapjoin_part
+ numFiles 4
numRows 0
partition_columns ds
partition_columns.types string
rawDataSize 0
- serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- totalSize 3062
+ totalSize 5812
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
properties:
- bucket_count 2
+ bucket_count 4
bucket_field_name key
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_2
+ name default.srcbucket_mapjoin_part
partition_columns ds
partition_columns.types string
- serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.srcbucket_mapjoin_part_2
- name: default.srcbucket_mapjoin_part_2
+ name: default.srcbucket_mapjoin_part
+ name: default.srcbucket_mapjoin_part
Truncated Path -> Alias:
- /srcbucket_mapjoin_part_2/ds=2008-04-08 [b]
- /srcbucket_mapjoin_part_2/ds=2008-04-09 [b]
- Reducer 2
- Needs Tagging: true
- Reduce Operator Tree:
- Join Operator
- condition map:
- Inner Join 0 to 1
- condition expressions:
- 0 {KEY.reducesinkkey0} {VALUE._col0}
- 1 {VALUE._col0}
- outputColumnNames: _col0, _col1, _col7
- Statistics: Num rows: 31 Data size: 3368 Basic stats: COMPLETE Column stats: NONE
- Select Operator
- expressions: _col0 (type: int), _col1 (type: string), _col7 (type: string)
- outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 31 Data size: 3368 Basic stats: COMPLETE Column stats: NONE
- File Output Operator
- compressed: false
- GlobalTableId: 1
-#### A masked pattern was here ####
- NumFilesPerFileSink: 1
- Statistics: Num rows: 31 Data size: 3368 Basic stats: COMPLETE Column stats: NONE
-#### A masked pattern was here ####
- table:
- input format: org.apache.hadoop.mapred.TextInputFormat
- output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
- properties:
- COLUMN_STATS_ACCURATE true
- bucket_count -1
- columns key,value1,value2
- columns.comments
- columns.types string:string:string
-#### A masked pattern was here ####
- name default.bucketmapjoin_tmp_result
- numFiles 3
- numRows 564
- rawDataSize 10503
- serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2}
- serialization.format 1
- serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- totalSize 11067
-#### A masked pattern was here ####
- serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.bucketmapjoin_tmp_result
- TotalFiles: 1
- GatherStats: true
- MultiFileSpray: false
+ /srcbucket_mapjoin_part/ds=2008-04-08 [a]
Stage: Stage-0
Move Operator
@@ -1230,7 +1239,7 @@ STAGE PLANS:
columns.types string:string:string
#### A masked pattern was here ####
name default.bucketmapjoin_tmp_result
- numFiles 3
+ numFiles 1
numRows 564
rawDataSize 10503
serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2}
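
In each converted plan, `Position of Big Table` records which input is streamed while the other side is hashed. In these tests the small side is pinned by the MAPJOIN hint in the query, so the big-table position simply tracks the un-hinted alias; absent a hint, a planner could fall back to a size heuristic like this deliberately simplified sketch (invented names, and a real choice must also respect outer-join sides and memory limits):

    // Hypothetical size-based big-table choice: stream the largest input
    // and hash the rest. Illustrative only, not Hive's algorithm.
    public class BigTablePicker {
        static int pickBigTable(long[] totalSizes) {
            int big = 0;
            for (int i = 1; i < totalSizes.length; i++) {
                if (totalSizes[i] > totalSizes[big]) {
                    big = i;
                }
            }
            return big; // would become "Position of Big Table" in the plan
        }

        public static void main(String[] args) {
            // totalSize values from the first bucketmapjoin2 plan:
            // a = 5812 bytes, b = 3062 bytes.
            System.out.println(pickBigTable(new long[]{5812L, 3062L})); // 0
        }
    }

The conversion also explains the `numFiles 3` -> `numFiles 1` changes in the destination-table hunks: the output file count now tracks the number of big-table map tasks writing FileSinks rather than the number of reducers.
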
diff --git ql/src/test/results/clientpositive/spark/bucketmapjoin3.q.out ql/src/test/results/clientpositive/spark/bucketmapjoin3.q.out
index 7c31e05..1b31561 100644
--- ql/src/test/results/clientpositive/spark/bucketmapjoin3.q.out
+++ ql/src/test/results/clientpositive/spark/bucketmapjoin3.q.out
@@ -189,35 +189,36 @@ TOK_QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-3 is a root stage
+ Stage-1 depends on stages: Stage-3
Stage-0 depends on stages: Stage-1
Stage-2 depends on stages: Stage-0
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-3
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 3 (PARTITION-LEVEL SORT, 1)
#### A masked pattern was here ####
Vertices:
- Map 1
+ Map 2
Map Operator Tree:
TableScan
- alias: a
- Statistics: Num rows: 29 Data size: 3062 Basic stats: COMPLETE Column stats: NONE
+ alias: b
+ Statistics: Num rows: 55 Data size: 5812 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: key is not null (type: boolean)
- Statistics: Num rows: 15 Data size: 1583 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int)
- sort order: +
- Map-reduce partition columns: key (type: int)
- Statistics: Num rows: 15 Data size: 1583 Basic stats: COMPLETE Column stats: NONE
- tag: 0
- value expressions: value (type: string)
- auto parallelism: false
+ Statistics: Num rows: 28 Data size: 2958 Basic stats: COMPLETE Column stats: NONE
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0 {key} {value}
+ 1 {value}
+ keys:
+ 0 key (type: int)
+ 1 key (type: int)
+ Position of Big Table: 0
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
@@ -230,64 +231,107 @@ STAGE PLANS:
ds 2008-04-08
properties:
COLUMN_STATS_ACCURATE true
- bucket_count 2
+ bucket_count 4
bucket_field_name key
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_2
- numFiles 2
+ name default.srcbucket_mapjoin_part
+ numFiles 4
numRows 0
partition_columns ds
partition_columns.types string
rawDataSize 0
- serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- totalSize 3062
+ totalSize 5812
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
properties:
- bucket_count 2
+ bucket_count 4
bucket_field_name key
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_2
+ name default.srcbucket_mapjoin_part
partition_columns ds
partition_columns.types string
- serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.srcbucket_mapjoin_part_2
- name: default.srcbucket_mapjoin_part_2
+ name: default.srcbucket_mapjoin_part
+ name: default.srcbucket_mapjoin_part
Truncated Path -> Alias:
- /srcbucket_mapjoin_part_2/ds=2008-04-08 [a]
- Map 3
+ /srcbucket_mapjoin_part/ds=2008-04-08 [b]
+
+ Stage: Stage-1
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
Map Operator Tree:
TableScan
- alias: b
- Statistics: Num rows: 55 Data size: 5812 Basic stats: COMPLETE Column stats: NONE
+ alias: a
+ Statistics: Num rows: 29 Data size: 3062 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: key is not null (type: boolean)
- Statistics: Num rows: 28 Data size: 2958 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int)
- sort order: +
- Map-reduce partition columns: key (type: int)
- Statistics: Num rows: 28 Data size: 2958 Basic stats: COMPLETE Column stats: NONE
- tag: 1
- value expressions: value (type: string)
- auto parallelism: false
+ Statistics: Num rows: 15 Data size: 1583 Basic stats: COMPLETE Column stats: NONE
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {key} {value}
+ 1 {value}
+ keys:
+ 0 key (type: int)
+ 1 key (type: int)
+ outputColumnNames: _col0, _col1, _col7
+ input vertices:
+ 1 Map 2
+ Position of Big Table: 0
+ Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: _col0 (type: int), _col1 (type: string), _col7 (type: string)
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ GlobalTableId: 1
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ bucket_count -1
+ columns key,value1,value2
+ columns.comments
+ columns.types string:string:string
+#### A masked pattern was here ####
+ name default.bucketmapjoin_tmp_result
+ serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucketmapjoin_tmp_result
+ TotalFiles: 1
+ GatherStats: true
+ MultiFileSpray: false
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
@@ -300,87 +344,46 @@ STAGE PLANS:
ds 2008-04-08
properties:
COLUMN_STATS_ACCURATE true
- bucket_count 4
+ bucket_count 2
bucket_field_name key
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part
- numFiles 4
+ name default.srcbucket_mapjoin_part_2
+ numFiles 2
numRows 0
partition_columns ds
partition_columns.types string
rawDataSize 0
- serialization.ddl struct srcbucket_mapjoin_part { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- totalSize 5812
+ totalSize 3062
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
properties:
- bucket_count 4
+ bucket_count 2
bucket_field_name key
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part
+ name default.srcbucket_mapjoin_part_2
partition_columns ds
partition_columns.types string
- serialization.ddl struct srcbucket_mapjoin_part { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.srcbucket_mapjoin_part
- name: default.srcbucket_mapjoin_part
+ name: default.srcbucket_mapjoin_part_2
+ name: default.srcbucket_mapjoin_part_2
Truncated Path -> Alias:
- /srcbucket_mapjoin_part/ds=2008-04-08 [b]
- Reducer 2
- Needs Tagging: true
- Reduce Operator Tree:
- Join Operator
- condition map:
- Inner Join 0 to 1
- condition expressions:
- 0 {KEY.reducesinkkey0} {VALUE._col0}
- 1 {VALUE._col0}
- outputColumnNames: _col0, _col1, _col7
- Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE
- Select Operator
- expressions: _col0 (type: int), _col1 (type: string), _col7 (type: string)
- outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE
- File Output Operator
- compressed: false
- GlobalTableId: 1
-#### A masked pattern was here ####
- NumFilesPerFileSink: 1
- Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE
-#### A masked pattern was here ####
- table:
- input format: org.apache.hadoop.mapred.TextInputFormat
- output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
- properties:
- bucket_count -1
- columns key,value1,value2
- columns.comments
- columns.types string:string:string
-#### A masked pattern was here ####
- name default.bucketmapjoin_tmp_result
- serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2}
- serialization.format 1
- serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
-#### A masked pattern was here ####
- serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.bucketmapjoin_tmp_result
- TotalFiles: 1
- GatherStats: true
- MultiFileSpray: false
+ /srcbucket_mapjoin_part_2/ds=2008-04-08 [a]
Stage: Stage-0
Move Operator
@@ -590,15 +593,14 @@ TOK_QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-3 is a root stage
+ Stage-1 depends on stages: Stage-3
Stage-0 depends on stages: Stage-1
Stage-2 depends on stages: Stage-0
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-3
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 3 (PARTITION-LEVEL SORT, 3)
#### A masked pattern was here ####
Vertices:
Map 1
@@ -611,14 +613,16 @@ STAGE PLANS:
isSamplingPred: false
predicate: key is not null (type: boolean)
Statistics: Num rows: 15 Data size: 1583 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int)
- sort order: +
- Map-reduce partition columns: key (type: int)
- Statistics: Num rows: 15 Data size: 1583 Basic stats: COMPLETE Column stats: NONE
- tag: 0
- value expressions: value (type: string)
- auto parallelism: false
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0 {value}
+ 1 {value}
+ keys:
+ 0 key (type: int)
+ 1 key (type: int)
+ Position of Big Table: 1
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
@@ -671,7 +675,12 @@ STAGE PLANS:
name: default.srcbucket_mapjoin_part_2
Truncated Path -> Alias:
/srcbucket_mapjoin_part_2/ds=2008-04-08 [a]
- Map 3
+
+ Stage: Stage-1
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 2
Map Operator Tree:
TableScan
alias: b
@@ -681,14 +690,57 @@ STAGE PLANS:
isSamplingPred: false
predicate: key is not null (type: boolean)
Statistics: Num rows: 28 Data size: 2958 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int)
- sort order: +
- Map-reduce partition columns: key (type: int)
- Statistics: Num rows: 28 Data size: 2958 Basic stats: COMPLETE Column stats: NONE
- tag: 1
- value expressions: value (type: string)
- auto parallelism: false
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {key} {value}
+ 1 {value}
+ keys:
+ 0 key (type: int)
+ 1 key (type: int)
+ outputColumnNames: _col0, _col1, _col7
+ input vertices:
+ 0 Map 1
+ Position of Big Table: 1
+ Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: _col0 (type: int), _col1 (type: string), _col7 (type: string)
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ GlobalTableId: 1
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ COLUMN_STATS_ACCURATE true
+ bucket_count -1
+ columns key,value1,value2
+ columns.comments
+ columns.types string:string:string
+#### A masked pattern was here ####
+ name default.bucketmapjoin_tmp_result
+ numFiles 1
+ numRows 564
+ rawDataSize 10503
+ serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 11067
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucketmapjoin_tmp_result
+ TotalFiles: 1
+ GatherStats: true
+ MultiFileSpray: false
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
@@ -741,52 +793,6 @@ STAGE PLANS:
name: default.srcbucket_mapjoin_part
Truncated Path -> Alias:
/srcbucket_mapjoin_part/ds=2008-04-08 [b]
- Reducer 2
- Needs Tagging: true
- Reduce Operator Tree:
- Join Operator
- condition map:
- Inner Join 0 to 1
- condition expressions:
- 0 {KEY.reducesinkkey0} {VALUE._col0}
- 1 {VALUE._col0}
- outputColumnNames: _col0, _col1, _col7
- Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE
- Select Operator
- expressions: _col0 (type: int), _col1 (type: string), _col7 (type: string)
- outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE
- File Output Operator
- compressed: false
- GlobalTableId: 1
-#### A masked pattern was here ####
- NumFilesPerFileSink: 1
- Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE
-#### A masked pattern was here ####
- table:
- input format: org.apache.hadoop.mapred.TextInputFormat
- output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
- properties:
- COLUMN_STATS_ACCURATE true
- bucket_count -1
- columns key,value1,value2
- columns.comments
- columns.types string:string:string
-#### A masked pattern was here ####
- name default.bucketmapjoin_tmp_result
- numFiles 3
- numRows 564
- rawDataSize 10503
- serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2}
- serialization.format 1
- serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- totalSize 11067
-#### A masked pattern was here ####
- serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.bucketmapjoin_tmp_result
- TotalFiles: 1
- GatherStats: true
- MultiFileSpray: false
Stage: Stage-0
Move Operator
@@ -804,7 +810,7 @@ STAGE PLANS:
columns.types string:string:string
#### A masked pattern was here ####
name default.bucketmapjoin_tmp_result
- numFiles 3
+ numFiles 1
numRows 564
rawDataSize 10503
serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2}
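
Before and after conversion, every TableScan keeps its `predicate: key is not null` Filter Operator: null keys can never satisfy an inner equi-join, so both inputs are pruned before any build or probe work, which is why the estimated row counts shrink (for example 29 -> 15 and 55 -> 28) ahead of the join. A minimal sketch of that pre-join pruning, again illustrative Java rather than Hive code:

    import java.util.List;
    import java.util.stream.Collectors;

    // Rows whose join key is null cannot match in an inner equi-join,
    // so they are dropped before the hash table is built or probed.
    public class NullKeyFilter {
        record Row(Integer key, String value) {}

        static List<Row> dropNullKeys(List<Row> rows) {
            return rows.stream()
                       .filter(r -> r.key() != null)
                       .collect(Collectors.toList());
        }

        public static void main(String[] args) {
            List<Row> rows = List.of(new Row(1, "a"), new Row(null, "b"));
            System.out.println(dropNullKeys(rows).size()); // 1
        }
    }
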
diff --git ql/src/test/results/clientpositive/spark/bucketmapjoin4.q.out ql/src/test/results/clientpositive/spark/bucketmapjoin4.q.out
index a8e892e..97d2d74 100644
--- ql/src/test/results/clientpositive/spark/bucketmapjoin4.q.out
+++ ql/src/test/results/clientpositive/spark/bucketmapjoin4.q.out
@@ -175,35 +175,36 @@ TOK_QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-3 is a root stage
+ Stage-1 depends on stages: Stage-3
Stage-0 depends on stages: Stage-1
Stage-2 depends on stages: Stage-0
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-3
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 3 (PARTITION-LEVEL SORT, 1)
#### A masked pattern was here ####
Vertices:
- Map 1
+ Map 2
Map Operator Tree:
TableScan
- alias: a
+ alias: b
Statistics: Num rows: 26 Data size: 2750 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: key is not null (type: boolean)
Statistics: Num rows: 13 Data size: 1375 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int)
- sort order: +
- Map-reduce partition columns: key (type: int)
- Statistics: Num rows: 13 Data size: 1375 Basic stats: COMPLETE Column stats: NONE
- tag: 0
- value expressions: value (type: string)
- auto parallelism: false
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0 {key} {value}
+ 1 {value}
+ keys:
+ 0 key (type: int)
+ 1 key (type: int)
+ Position of Big Table: 0
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
@@ -250,25 +251,68 @@ STAGE PLANS:
name: default.srcbucket_mapjoin
name: default.srcbucket_mapjoin
Truncated Path -> Alias:
- /srcbucket_mapjoin [a]
- Map 3
+ /srcbucket_mapjoin [b]
+
+ Stage: Stage-1
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
Map Operator Tree:
TableScan
- alias: b
+ alias: a
Statistics: Num rows: 26 Data size: 2750 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: key is not null (type: boolean)
Statistics: Num rows: 13 Data size: 1375 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int)
- sort order: +
- Map-reduce partition columns: key (type: int)
- Statistics: Num rows: 13 Data size: 1375 Basic stats: COMPLETE Column stats: NONE
- tag: 1
- value expressions: value (type: string)
- auto parallelism: false
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {key} {value}
+ 1 {value}
+ keys:
+ 0 key (type: int)
+ 1 key (type: int)
+ outputColumnNames: _col0, _col1, _col6
+ input vertices:
+ 1 Map 2
+ Position of Big Table: 0
+ Statistics: Num rows: 14 Data size: 1512 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: _col0 (type: int), _col1 (type: string), _col6 (type: string)
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 14 Data size: 1512 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ GlobalTableId: 1
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ Statistics: Num rows: 14 Data size: 1512 Basic stats: COMPLETE Column stats: NONE
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ bucket_count -1
+ columns key,value1,value2
+ columns.comments
+ columns.types string:string:string
+#### A masked pattern was here ####
+ name default.bucketmapjoin_tmp_result
+ serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucketmapjoin_tmp_result
+ TotalFiles: 1
+ GatherStats: true
+ MultiFileSpray: false
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
@@ -315,48 +359,7 @@ STAGE PLANS:
name: default.srcbucket_mapjoin
name: default.srcbucket_mapjoin
Truncated Path -> Alias:
- /srcbucket_mapjoin [b]
- Reducer 2
- Needs Tagging: true
- Reduce Operator Tree:
- Join Operator
- condition map:
- Inner Join 0 to 1
- condition expressions:
- 0 {KEY.reducesinkkey0} {VALUE._col0}
- 1 {VALUE._col0}
- outputColumnNames: _col0, _col1, _col6
- Statistics: Num rows: 14 Data size: 1512 Basic stats: COMPLETE Column stats: NONE
- Select Operator
- expressions: _col0 (type: int), _col1 (type: string), _col6 (type: string)
- outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 14 Data size: 1512 Basic stats: COMPLETE Column stats: NONE
- File Output Operator
- compressed: false
- GlobalTableId: 1
-#### A masked pattern was here ####
- NumFilesPerFileSink: 1
- Statistics: Num rows: 14 Data size: 1512 Basic stats: COMPLETE Column stats: NONE
-#### A masked pattern was here ####
- table:
- input format: org.apache.hadoop.mapred.TextInputFormat
- output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
- properties:
- bucket_count -1
- columns key,value1,value2
- columns.comments
- columns.types string:string:string
-#### A masked pattern was here ####
- name default.bucketmapjoin_tmp_result
- serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2}
- serialization.format 1
- serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
-#### A masked pattern was here ####
- serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.bucketmapjoin_tmp_result
- TotalFiles: 1
- GatherStats: true
- MultiFileSpray: false
+ /srcbucket_mapjoin [a]
Stage: Stage-0
Move Operator
@@ -540,15 +543,14 @@ TOK_QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-3 is a root stage
+ Stage-1 depends on stages: Stage-3
Stage-0 depends on stages: Stage-1
Stage-2 depends on stages: Stage-0
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-3
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 3 (PARTITION-LEVEL SORT, 3)
#### A masked pattern was here ####
Vertices:
Map 1
@@ -561,14 +563,16 @@ STAGE PLANS:
isSamplingPred: false
predicate: key is not null (type: boolean)
Statistics: Num rows: 13 Data size: 1375 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int)
- sort order: +
- Map-reduce partition columns: key (type: int)
- Statistics: Num rows: 13 Data size: 1375 Basic stats: COMPLETE Column stats: NONE
- tag: 0
- value expressions: value (type: string)
- auto parallelism: false
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0 {value}
+ 1 {value}
+ keys:
+ 0 key (type: int)
+ 1 key (type: int)
+ Position of Big Table: 1
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
@@ -616,7 +620,12 @@ STAGE PLANS:
name: default.srcbucket_mapjoin
Truncated Path -> Alias:
/srcbucket_mapjoin [a]
- Map 3
+
+ Stage: Stage-1
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 2
Map Operator Tree:
TableScan
alias: b
@@ -626,14 +635,57 @@ STAGE PLANS:
isSamplingPred: false
predicate: key is not null (type: boolean)
Statistics: Num rows: 13 Data size: 1375 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int)
- sort order: +
- Map-reduce partition columns: key (type: int)
- Statistics: Num rows: 13 Data size: 1375 Basic stats: COMPLETE Column stats: NONE
- tag: 1
- value expressions: value (type: string)
- auto parallelism: false
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {key} {value}
+ 1 {value}
+ keys:
+ 0 key (type: int)
+ 1 key (type: int)
+ outputColumnNames: _col0, _col1, _col6
+ input vertices:
+ 0 Map 1
+ Position of Big Table: 1
+ Statistics: Num rows: 14 Data size: 1512 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: _col0 (type: int), _col1 (type: string), _col6 (type: string)
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 14 Data size: 1512 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ GlobalTableId: 1
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ Statistics: Num rows: 14 Data size: 1512 Basic stats: COMPLETE Column stats: NONE
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ COLUMN_STATS_ACCURATE true
+ bucket_count -1
+ columns key,value1,value2
+ columns.comments
+ columns.types string:string:string
+#### A masked pattern was here ####
+ name default.bucketmapjoin_tmp_result
+ numFiles 2
+ numRows 464
+ rawDataSize 8519
+ serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 8983
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucketmapjoin_tmp_result
+ TotalFiles: 1
+ GatherStats: true
+ MultiFileSpray: false
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
@@ -681,52 +733,6 @@ STAGE PLANS:
name: default.srcbucket_mapjoin
Truncated Path -> Alias:
/srcbucket_mapjoin [b]
- Reducer 2
- Needs Tagging: true
- Reduce Operator Tree:
- Join Operator
- condition map:
- Inner Join 0 to 1
- condition expressions:
- 0 {KEY.reducesinkkey0} {VALUE._col0}
- 1 {VALUE._col0}
- outputColumnNames: _col0, _col1, _col6
- Statistics: Num rows: 14 Data size: 1512 Basic stats: COMPLETE Column stats: NONE
- Select Operator
- expressions: _col0 (type: int), _col1 (type: string), _col6 (type: string)
- outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 14 Data size: 1512 Basic stats: COMPLETE Column stats: NONE
- File Output Operator
- compressed: false
- GlobalTableId: 1
-#### A masked pattern was here ####
- NumFilesPerFileSink: 1
- Statistics: Num rows: 14 Data size: 1512 Basic stats: COMPLETE Column stats: NONE
-#### A masked pattern was here ####
- table:
- input format: org.apache.hadoop.mapred.TextInputFormat
- output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
- properties:
- COLUMN_STATS_ACCURATE true
- bucket_count -1
- columns key,value1,value2
- columns.comments
- columns.types string:string:string
-#### A masked pattern was here ####
- name default.bucketmapjoin_tmp_result
- numFiles 3
- numRows 464
- rawDataSize 8519
- serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2}
- serialization.format 1
- serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- totalSize 8983
-#### A masked pattern was here ####
- serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.bucketmapjoin_tmp_result
- TotalFiles: 1
- GatherStats: true
- MultiFileSpray: false
Stage: Stage-0
Move Operator
@@ -744,7 +750,7 @@ STAGE PLANS:
columns.types string:string:string
#### A masked pattern was here ####
name default.bucketmapjoin_tmp_result
- numFiles 3
+ numFiles 2
numRows 464
rawDataSize 8519
serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2}
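
The reshaped `STAGE DEPENDENCIES` blocks ("Stage-3 is a root stage", "Stage-1 depends on stages: Stage-3") order the work: the hash-table build must finish before the big-table stage probes it, after which Stage-0 moves the result and Stage-2 follows. A toy dependency-ordered runner, under the assumption that each stage is an opaque unit of work (names and structure invented for illustration):

    import java.util.HashSet;
    import java.util.LinkedHashMap;
    import java.util.List;
    import java.util.Map;
    import java.util.Set;

    // Toy scheduler mirroring the STAGE DEPENDENCIES blocks: a stage runs
    // only once everything it depends on has completed. Hive's driver does
    // considerably more (spawning tasks, error handling, conditional stages).
    public class StageRunner {
        public static void main(String[] args) {
            Map<String, List<String>> deps = new LinkedHashMap<>();
            deps.put("Stage-3", List.of());          // root: build hash table
            deps.put("Stage-1", List.of("Stage-3")); // map join over big table
            deps.put("Stage-0", List.of("Stage-1")); // move results into place
            deps.put("Stage-2", List.of("Stage-0")); // post-move follow-up
            run(deps);
        }

        static void run(Map<String, List<String>> deps) {
            Set<String> done = new HashSet<>();
            while (done.size() < deps.size()) {
                for (Map.Entry<String, List<String>> e : deps.entrySet()) {
                    if (!done.contains(e.getKey()) && done.containsAll(e.getValue())) {
                        System.out.println("running " + e.getKey());
                        done.add(e.getKey());
                    }
                }
            }
        }
    }
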
diff --git ql/src/test/results/clientpositive/spark/bucketmapjoin5.q.out ql/src/test/results/clientpositive/spark/bucketmapjoin5.q.out
index 041ba12..94952a1 100644
--- ql/src/test/results/clientpositive/spark/bucketmapjoin5.q.out
+++ ql/src/test/results/clientpositive/spark/bucketmapjoin5.q.out
@@ -225,15 +225,14 @@ TOK_QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-3 is a root stage
+ Stage-1 depends on stages: Stage-3
Stage-0 depends on stages: Stage-1
Stage-2 depends on stages: Stage-0
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-3
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 3 (PARTITION-LEVEL SORT, 1)
#### A masked pattern was here ####
Vertices:
Map 1
@@ -246,14 +245,16 @@ STAGE PLANS:
isSamplingPred: false
predicate: key is not null (type: boolean)
Statistics: Num rows: 13 Data size: 1375 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int)
- sort order: +
- Map-reduce partition columns: key (type: int)
- Statistics: Num rows: 13 Data size: 1375 Basic stats: COMPLETE Column stats: NONE
- tag: 0
- value expressions: value (type: string)
- auto parallelism: false
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0 {value}
+ 1 {value}
+ keys:
+ 0 key (type: int)
+ 1 key (type: int)
+ Position of Big Table: 1
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
@@ -301,7 +302,12 @@ STAGE PLANS:
name: default.srcbucket_mapjoin
Truncated Path -> Alias:
/srcbucket_mapjoin [a]
- Map 3
+
+ Stage: Stage-1
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 2
Map Operator Tree:
TableScan
alias: b
@@ -311,14 +317,52 @@ STAGE PLANS:
isSamplingPred: false
predicate: key is not null (type: boolean)
Statistics: Num rows: 55 Data size: 5812 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int)
- sort order: +
- Map-reduce partition columns: key (type: int)
- Statistics: Num rows: 55 Data size: 5812 Basic stats: COMPLETE Column stats: NONE
- tag: 1
- value expressions: value (type: string)
- auto parallelism: false
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {key} {value}
+ 1 {value}
+ keys:
+ 0 key (type: int)
+ 1 key (type: int)
+ outputColumnNames: _col0, _col1, _col6
+ input vertices:
+ 0 Map 1
+ Position of Big Table: 1
+ Statistics: Num rows: 60 Data size: 6393 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: _col0 (type: int), _col1 (type: string), _col6 (type: string)
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 60 Data size: 6393 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ GlobalTableId: 1
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ Statistics: Num rows: 60 Data size: 6393 Basic stats: COMPLETE Column stats: NONE
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ bucket_count -1
+ columns key,value1,value2
+ columns.comments
+ columns.types string:string:string
+#### A masked pattern was here ####
+ name default.bucketmapjoin_tmp_result
+ serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucketmapjoin_tmp_result
+ TotalFiles: 1
+ GatherStats: true
+ MultiFileSpray: false
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
@@ -419,47 +463,6 @@ STAGE PLANS:
Truncated Path -> Alias:
/srcbucket_mapjoin_part/ds=2008-04-08 [b]
/srcbucket_mapjoin_part/ds=2008-04-09 [b]
- Reducer 2
- Needs Tagging: true
- Reduce Operator Tree:
- Join Operator
- condition map:
- Inner Join 0 to 1
- condition expressions:
- 0 {KEY.reducesinkkey0} {VALUE._col0}
- 1 {VALUE._col0}
- outputColumnNames: _col0, _col1, _col6
- Statistics: Num rows: 60 Data size: 6393 Basic stats: COMPLETE Column stats: NONE
- Select Operator
- expressions: _col0 (type: int), _col1 (type: string), _col6 (type: string)
- outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 60 Data size: 6393 Basic stats: COMPLETE Column stats: NONE
- File Output Operator
- compressed: false
- GlobalTableId: 1
-#### A masked pattern was here ####
- NumFilesPerFileSink: 1
- Statistics: Num rows: 60 Data size: 6393 Basic stats: COMPLETE Column stats: NONE
-#### A masked pattern was here ####
- table:
- input format: org.apache.hadoop.mapred.TextInputFormat
- output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
- properties:
- bucket_count -1
- columns key,value1,value2
- columns.comments
- columns.types string:string:string
-#### A masked pattern was here ####
- name default.bucketmapjoin_tmp_result
- serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2}
- serialization.format 1
- serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
-#### A masked pattern was here ####
- serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.bucketmapjoin_tmp_result
- TotalFiles: 1
- GatherStats: true
- MultiFileSpray: false
Stage: Stage-0
Move Operator
@@ -655,15 +658,14 @@ TOK_QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-3 is a root stage
+ Stage-1 depends on stages: Stage-3
Stage-0 depends on stages: Stage-1
Stage-2 depends on stages: Stage-0
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-3
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 3 (PARTITION-LEVEL SORT, 3)
#### A masked pattern was here ####
Vertices:
Map 1
@@ -676,14 +678,16 @@ STAGE PLANS:
isSamplingPred: false
predicate: key is not null (type: boolean)
Statistics: Num rows: 13 Data size: 1375 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int)
- sort order: +
- Map-reduce partition columns: key (type: int)
- Statistics: Num rows: 13 Data size: 1375 Basic stats: COMPLETE Column stats: NONE
- tag: 0
- value expressions: value (type: string)
- auto parallelism: false
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0 {value}
+ 1 {value}
+ keys:
+ 0 key (type: int)
+ 1 key (type: int)
+ Position of Big Table: 1
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
@@ -731,7 +735,12 @@ STAGE PLANS:
name: default.srcbucket_mapjoin
Truncated Path -> Alias:
/srcbucket_mapjoin [a]
- Map 3
+
+ Stage: Stage-1
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 2
Map Operator Tree:
TableScan
alias: b
@@ -741,14 +750,57 @@ STAGE PLANS:
isSamplingPred: false
predicate: key is not null (type: boolean)
Statistics: Num rows: 29 Data size: 3062 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int)
- sort order: +
- Map-reduce partition columns: key (type: int)
- Statistics: Num rows: 29 Data size: 3062 Basic stats: COMPLETE Column stats: NONE
- tag: 1
- value expressions: value (type: string)
- auto parallelism: false
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {key} {value}
+ 1 {value}
+ keys:
+ 0 key (type: int)
+ 1 key (type: int)
+ outputColumnNames: _col0, _col1, _col6
+ input vertices:
+ 0 Map 1
+ Position of Big Table: 1
+ Statistics: Num rows: 31 Data size: 3368 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: _col0 (type: int), _col1 (type: string), _col6 (type: string)
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 31 Data size: 3368 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ GlobalTableId: 1
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ Statistics: Num rows: 31 Data size: 3368 Basic stats: COMPLETE Column stats: NONE
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ COLUMN_STATS_ACCURATE true
+ bucket_count -1
+ columns key,value1,value2
+ columns.comments
+ columns.types string:string:string
+#### A masked pattern was here ####
+ name default.bucketmapjoin_tmp_result
+ numFiles 1
+ numRows 928
+ rawDataSize 17038
+ serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 17966
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucketmapjoin_tmp_result
+ TotalFiles: 1
+ GatherStats: true
+ MultiFileSpray: false
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
@@ -849,52 +901,6 @@ STAGE PLANS:
Truncated Path -> Alias:
/srcbucket_mapjoin_part_2/ds=2008-04-08 [b]
/srcbucket_mapjoin_part_2/ds=2008-04-09 [b]
- Reducer 2
- Needs Tagging: true
- Reduce Operator Tree:
- Join Operator
- condition map:
- Inner Join 0 to 1
- condition expressions:
- 0 {KEY.reducesinkkey0} {VALUE._col0}
- 1 {VALUE._col0}
- outputColumnNames: _col0, _col1, _col6
- Statistics: Num rows: 31 Data size: 3368 Basic stats: COMPLETE Column stats: NONE
- Select Operator
- expressions: _col0 (type: int), _col1 (type: string), _col6 (type: string)
- outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 31 Data size: 3368 Basic stats: COMPLETE Column stats: NONE
- File Output Operator
- compressed: false
- GlobalTableId: 1
-#### A masked pattern was here ####
- NumFilesPerFileSink: 1
- Statistics: Num rows: 31 Data size: 3368 Basic stats: COMPLETE Column stats: NONE
-#### A masked pattern was here ####
- table:
- input format: org.apache.hadoop.mapred.TextInputFormat
- output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
- properties:
- COLUMN_STATS_ACCURATE true
- bucket_count -1
- columns key,value1,value2
- columns.comments
- columns.types string:string:string
-#### A masked pattern was here ####
- name default.bucketmapjoin_tmp_result
- numFiles 3
- numRows 928
- rawDataSize 17038
- serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2}
- serialization.format 1
- serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- totalSize 17966
-#### A masked pattern was here ####
- serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.bucketmapjoin_tmp_result
- TotalFiles: 1
- GatherStats: true
- MultiFileSpray: false
Stage: Stage-0
Move Operator
@@ -912,7 +918,7 @@ STAGE PLANS:
columns.types string:string:string
#### A masked pattern was here ####
name default.bucketmapjoin_tmp_result
- numFiles 3
+ numFiles 1
numRows 928
rawDataSize 17038
serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2}
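
Note on the plan diffs above: each follows the same rewrite. The old common join (paired Reduce Output Operators tagged 0/1 feeding a Join Operator in Reducer 2) becomes a two-stage Spark plan: a new root stage builds a hash table from the small side via a Spark HashTable Sink Operator, and the dependent stage streams the big side through a Map Join Operator with Map Reduce Local Work, writing output directly from the mapper. A representative query of the kind that produces these plans — a reconstruction for orientation only, since the .q sources are not part of these hunks; table and column names come from the plan metadata, while the hint and query shape are assumptions:

    SET hive.optimize.bucketmapjoin = true;
    -- Small side `a` (srcbucket_mapjoin) is hinted into the hash table;
    -- the plan's "Position of Big Table: 1" means `b` is the streamed side.
    -- The second EXPLAIN in the file joins against srcbucket_mapjoin_part_2.
    INSERT OVERWRITE TABLE bucketmapjoin_tmp_result
    SELECT /*+ MAPJOIN(a) */ a.key, a.value, b.value
    FROM srcbucket_mapjoin a
    JOIN srcbucket_mapjoin_part b ON a.key = b.key;
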
diff --git ql/src/test/results/clientpositive/spark/bucketmapjoin7.q.out ql/src/test/results/clientpositive/spark/bucketmapjoin7.q.out
index 54c4be3..ca59d02 100644
--- ql/src/test/results/clientpositive/spark/bucketmapjoin7.q.out
+++ ql/src/test/results/clientpositive/spark/bucketmapjoin7.q.out
@@ -140,34 +140,35 @@ TOK_QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-2 is a root stage
+ Stage-1 depends on stages: Stage-2
Stage-0 depends on stages: Stage-1
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-2
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 4 (PARTITION-LEVEL SORT, 1)
- Reducer 3 <- Reducer 2 (SORT, 1)
#### A masked pattern was here ####
Vertices:
- Map 1
+ Map 3
Map Operator Tree:
TableScan
- alias: a
- Statistics: Num rows: 687 Data size: 2750 Basic stats: COMPLETE Column stats: NONE
+ alias: b
+ Statistics: Num rows: 26 Data size: 2750 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: key is not null (type: boolean)
- Statistics: Num rows: 344 Data size: 1377 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int)
- sort order: +
- Map-reduce partition columns: key (type: int)
- Statistics: Num rows: 344 Data size: 1377 Basic stats: COMPLETE Column stats: NONE
- tag: 0
- auto parallelism: false
+ Statistics: Num rows: 13 Data size: 1375 Basic stats: COMPLETE Column stats: NONE
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0 {key}
+ 1 {value}
+ keys:
+ 0 key (type: int)
+ 1 key (type: int)
+ Position of Big Table: 0
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
@@ -187,13 +188,13 @@ STAGE PLANS:
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_1
+ name default.srcbucket_mapjoin_part_2
numFiles 2
numRows 0
partition_columns ds/hr
partition_columns.types string:string
rawDataSize 0
- serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
totalSize 2750
@@ -209,36 +210,61 @@ STAGE PLANS:
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_1
+ name default.srcbucket_mapjoin_part_2
partition_columns ds/hr
partition_columns.types string:string
- serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.srcbucket_mapjoin_part_1
- name: default.srcbucket_mapjoin_part_1
+ name: default.srcbucket_mapjoin_part_2
+ name: default.srcbucket_mapjoin_part_2
Truncated Path -> Alias:
- /srcbucket_mapjoin_part_1/ds=2008-04-08/hr=0 [a]
- Map 4
+ /srcbucket_mapjoin_part_2/ds=2008-04-08/hr=0 [b]
+
+ Stage: Stage-1
+ Spark
+ Edges:
+ Reducer 2 <- Map 1 (SORT, 1)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
Map Operator Tree:
TableScan
- alias: b
- Statistics: Num rows: 26 Data size: 2750 Basic stats: COMPLETE Column stats: NONE
+ alias: a
+ Statistics: Num rows: 687 Data size: 2750 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: key is not null (type: boolean)
- Statistics: Num rows: 13 Data size: 1375 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int)
- sort order: +
- Map-reduce partition columns: key (type: int)
- Statistics: Num rows: 13 Data size: 1375 Basic stats: COMPLETE Column stats: NONE
- tag: 1
- value expressions: value (type: string)
- auto parallelism: false
+ Statistics: Num rows: 344 Data size: 1377 Basic stats: COMPLETE Column stats: NONE
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {key}
+ 1 {value}
+ keys:
+ 0 key (type: int)
+ 1 key (type: int)
+ outputColumnNames: _col0, _col8
+ input vertices:
+ 1 Map 3
+ Position of Big Table: 0
+ Statistics: Num rows: 378 Data size: 1514 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: _col0 (type: int), _col8 (type: string)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 378 Data size: 1514 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ key expressions: _col0 (type: int), _col1 (type: string)
+ sort order: ++
+ Statistics: Num rows: 378 Data size: 1514 Basic stats: COMPLETE Column stats: NONE
+ tag: -1
+ auto parallelism: false
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
@@ -258,13 +284,13 @@ STAGE PLANS:
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_2
+ name default.srcbucket_mapjoin_part_1
numFiles 2
numRows 0
partition_columns ds/hr
partition_columns.types string:string
rawDataSize 0
- serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
totalSize 2750
@@ -280,40 +306,19 @@ STAGE PLANS:
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_2
+ name default.srcbucket_mapjoin_part_1
partition_columns ds/hr
partition_columns.types string:string
- serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.srcbucket_mapjoin_part_2
- name: default.srcbucket_mapjoin_part_2
+ name: default.srcbucket_mapjoin_part_1
+ name: default.srcbucket_mapjoin_part_1
Truncated Path -> Alias:
- /srcbucket_mapjoin_part_2/ds=2008-04-08/hr=0 [b]
+ /srcbucket_mapjoin_part_1/ds=2008-04-08/hr=0 [a]
Reducer 2
- Needs Tagging: true
- Reduce Operator Tree:
- Join Operator
- condition map:
- Inner Join 0 to 1
- condition expressions:
- 0 {KEY.reducesinkkey0}
- 1 {VALUE._col0}
- outputColumnNames: _col0, _col8
- Statistics: Num rows: 378 Data size: 1514 Basic stats: COMPLETE Column stats: NONE
- Select Operator
- expressions: _col0 (type: int), _col8 (type: string)
- outputColumnNames: _col0, _col1
- Statistics: Num rows: 378 Data size: 1514 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: _col0 (type: int), _col1 (type: string)
- sort order: ++
- Statistics: Num rows: 378 Data size: 1514 Basic stats: COMPLETE Column stats: NONE
- tag: -1
- auto parallelism: false
- Reducer 3
Needs Tagging: false
Reduce Operator Tree:
Select Operator
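
Note on bucketmapjoin7.q.out: the conversion also reorders the vertices — the small side b (srcbucket_mapjoin_part_2, "Position of Big Table: 0") now runs first as Stage-2/Map 3 to populate the hash table, while the big side a keeps a single SORT edge into Reducer 2 for the final ordering. A hedged sketch of the query shape the plan implies (sorted two-column output over a join of the hr=0 partitions; the exact predicates and any LIMIT are assumptions):

    -- `b` is hinted small; the two-column ++ sort order in the plan
    -- corresponds to the ORDER BY below.
    SELECT /*+ MAPJOIN(b) */ a.key, b.value
    FROM srcbucket_mapjoin_part_1 a
    JOIN srcbucket_mapjoin_part_2 b
      ON a.key = b.key
     AND a.ds = '2008-04-08' AND a.hr = '0'
     AND b.ds = '2008-04-08' AND b.hr = '0'
    ORDER BY a.key, b.value;
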
diff --git ql/src/test/results/clientpositive/spark/bucketmapjoin8.q.out ql/src/test/results/clientpositive/spark/bucketmapjoin8.q.out
index da9fe1c..f419eaf 100644
--- ql/src/test/results/clientpositive/spark/bucketmapjoin8.q.out
+++ ql/src/test/results/clientpositive/spark/bucketmapjoin8.q.out
@@ -126,34 +126,35 @@ TOK_QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-2 is a root stage
+ Stage-1 depends on stages: Stage-2
Stage-0 depends on stages: Stage-1
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-2
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 4 (PARTITION-LEVEL SORT, 1)
- Reducer 3 <- Reducer 2 (GROUP, 1)
#### A masked pattern was here ####
Vertices:
- Map 1
+ Map 3
Map Operator Tree:
TableScan
- alias: a
+ alias: b
Statistics: Num rows: 687 Data size: 2750 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: key is not null (type: boolean)
Statistics: Num rows: 344 Data size: 1377 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int)
- sort order: +
- Map-reduce partition columns: key (type: int)
- Statistics: Num rows: 344 Data size: 1377 Basic stats: COMPLETE Column stats: NONE
- tag: 0
- auto parallelism: false
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ keys:
+ 0 key (type: int)
+ 1 key (type: int)
+ Position of Big Table: 0
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
@@ -172,13 +173,13 @@ STAGE PLANS:
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_1
+ name default.srcbucket_mapjoin_part_2
numFiles 2
numRows 0
partition_columns part
partition_columns.types string
rawDataSize 0
- serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
totalSize 2750
@@ -188,41 +189,67 @@ STAGE PLANS:
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
properties:
- bucket_count 2
+ bucket_count 3
bucket_field_name key
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_1
+ name default.srcbucket_mapjoin_part_2
partition_columns part
partition_columns.types string
- serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.srcbucket_mapjoin_part_1
- name: default.srcbucket_mapjoin_part_1
+ name: default.srcbucket_mapjoin_part_2
+ name: default.srcbucket_mapjoin_part_2
Truncated Path -> Alias:
- /srcbucket_mapjoin_part_1/part=1 [a]
- Map 4
+ /srcbucket_mapjoin_part_2/part=1 [b]
+
+ Stage: Stage-1
+ Spark
+ Edges:
+ Reducer 2 <- Map 1 (GROUP, 1)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
Map Operator Tree:
TableScan
- alias: b
+ alias: a
Statistics: Num rows: 687 Data size: 2750 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: key is not null (type: boolean)
Statistics: Num rows: 344 Data size: 1377 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int)
- sort order: +
- Map-reduce partition columns: key (type: int)
- Statistics: Num rows: 344 Data size: 1377 Basic stats: COMPLETE Column stats: NONE
- tag: 1
- auto parallelism: false
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ keys:
+ 0 key (type: int)
+ 1 key (type: int)
+ input vertices:
+ 1 Map 3
+ Position of Big Table: 0
+ Statistics: Num rows: 378 Data size: 1514 Basic stats: COMPLETE Column stats: NONE
+ Group By Operator
+ aggregations: count()
+ mode: hash
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ sort order:
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+ tag: -1
+ value expressions: _col0 (type: bigint)
+ auto parallelism: false
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
@@ -241,13 +268,13 @@ STAGE PLANS:
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_2
+ name default.srcbucket_mapjoin_part_1
numFiles 2
numRows 0
partition_columns part
partition_columns.types string
rawDataSize 0
- serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
totalSize 2750
@@ -257,46 +284,25 @@ STAGE PLANS:
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
properties:
- bucket_count 3
+ bucket_count 2
bucket_field_name key
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_2
+ name default.srcbucket_mapjoin_part_1
partition_columns part
partition_columns.types string
- serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.srcbucket_mapjoin_part_2
- name: default.srcbucket_mapjoin_part_2
+ name: default.srcbucket_mapjoin_part_1
+ name: default.srcbucket_mapjoin_part_1
Truncated Path -> Alias:
- /srcbucket_mapjoin_part_2/part=1 [b]
+ /srcbucket_mapjoin_part_1/part=1 [a]
Reducer 2
- Needs Tagging: true
- Reduce Operator Tree:
- Join Operator
- condition map:
- Inner Join 0 to 1
- condition expressions:
- 0
- 1
- Statistics: Num rows: 378 Data size: 1514 Basic stats: COMPLETE Column stats: NONE
- Group By Operator
- aggregations: count()
- mode: hash
- outputColumnNames: _col0
- Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- sort order:
- Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
- tag: -1
- value expressions: _col0 (type: bigint)
- auto parallelism: false
- Reducer 3
Needs Tagging: false
Reduce Operator Tree:
Group By Operator
@@ -429,34 +435,35 @@ TOK_QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-2 is a root stage
+ Stage-1 depends on stages: Stage-2
Stage-0 depends on stages: Stage-1
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-2
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 4 (PARTITION-LEVEL SORT, 3)
- Reducer 3 <- Reducer 2 (GROUP, 1)
#### A masked pattern was here ####
Vertices:
- Map 1
+ Map 3
Map Operator Tree:
TableScan
- alias: a
+ alias: b
Statistics: Num rows: 687 Data size: 2750 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: key is not null (type: boolean)
Statistics: Num rows: 344 Data size: 1377 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int)
- sort order: +
- Map-reduce partition columns: key (type: int)
- Statistics: Num rows: 344 Data size: 1377 Basic stats: COMPLETE Column stats: NONE
- tag: 0
- auto parallelism: false
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ keys:
+ 0 key (type: int)
+ 1 key (type: int)
+ Position of Big Table: 0
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
@@ -475,13 +482,13 @@ STAGE PLANS:
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_1
+ name default.srcbucket_mapjoin_part_2
numFiles 2
numRows 0
partition_columns part
partition_columns.types string
rawDataSize 0
- serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
totalSize 2750
@@ -492,40 +499,66 @@ STAGE PLANS:
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
properties:
bucket_count 2
- bucket_field_name key
+ bucket_field_name value
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_1
+ name default.srcbucket_mapjoin_part_2
partition_columns part
partition_columns.types string
- serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.srcbucket_mapjoin_part_1
- name: default.srcbucket_mapjoin_part_1
+ name: default.srcbucket_mapjoin_part_2
+ name: default.srcbucket_mapjoin_part_2
Truncated Path -> Alias:
- /srcbucket_mapjoin_part_1/part=1 [a]
- Map 4
+ /srcbucket_mapjoin_part_2/part=1 [b]
+
+ Stage: Stage-1
+ Spark
+ Edges:
+ Reducer 2 <- Map 1 (GROUP, 1)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
Map Operator Tree:
TableScan
- alias: b
+ alias: a
Statistics: Num rows: 687 Data size: 2750 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: key is not null (type: boolean)
Statistics: Num rows: 344 Data size: 1377 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int)
- sort order: +
- Map-reduce partition columns: key (type: int)
- Statistics: Num rows: 344 Data size: 1377 Basic stats: COMPLETE Column stats: NONE
- tag: 1
- auto parallelism: false
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ keys:
+ 0 key (type: int)
+ 1 key (type: int)
+ input vertices:
+ 1 Map 3
+ Position of Big Table: 0
+ Statistics: Num rows: 378 Data size: 1514 Basic stats: COMPLETE Column stats: NONE
+ Group By Operator
+ aggregations: count()
+ mode: hash
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ sort order:
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+ tag: -1
+ value expressions: _col0 (type: bigint)
+ auto parallelism: false
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
@@ -544,13 +577,13 @@ STAGE PLANS:
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_2
+ name default.srcbucket_mapjoin_part_1
numFiles 2
numRows 0
partition_columns part
partition_columns.types string
rawDataSize 0
- serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
totalSize 2750
@@ -561,45 +594,24 @@ STAGE PLANS:
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
properties:
bucket_count 2
- bucket_field_name value
+ bucket_field_name key
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_2
+ name default.srcbucket_mapjoin_part_1
partition_columns part
partition_columns.types string
- serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.srcbucket_mapjoin_part_2
- name: default.srcbucket_mapjoin_part_2
+ name: default.srcbucket_mapjoin_part_1
+ name: default.srcbucket_mapjoin_part_1
Truncated Path -> Alias:
- /srcbucket_mapjoin_part_2/part=1 [b]
+ /srcbucket_mapjoin_part_1/part=1 [a]
Reducer 2
- Needs Tagging: true
- Reduce Operator Tree:
- Join Operator
- condition map:
- Inner Join 0 to 1
- condition expressions:
- 0
- 1
- Statistics: Num rows: 378 Data size: 1514 Basic stats: COMPLETE Column stats: NONE
- Group By Operator
- aggregations: count()
- mode: hash
- outputColumnNames: _col0
- Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- sort order:
- Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
- tag: -1
- value expressions: _col0 (type: bigint)
- auto parallelism: false
- Reducer 3
Needs Tagging: false
Reduce Operator Tree:
Group By Operator
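
Note on bucketmapjoin8.q.out: the same conversion applied to a count(*) over the join, which is why both condition-expressions lists are empty — no value columns survive into the aggregate. The two EXPLAINs differ only in table metadata (bucket_count 2 vs 3, then bucket_field_name key vs value), exercising the planner when the bucket layouts of the two sides do not line up. A hedged sketch of the aggregate query the plan implies:

    -- count(*) over the join; partition column `part` = '1' per the
    -- Truncated Path entries. Hint direction is an assumption.
    SELECT /*+ MAPJOIN(b) */ count(*)
    FROM srcbucket_mapjoin_part_1 a
    JOIN srcbucket_mapjoin_part_2 b
      ON a.key = b.key
    WHERE a.part = '1' AND b.part = '1';
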
diff --git ql/src/test/results/clientpositive/spark/bucketmapjoin9.q.out ql/src/test/results/clientpositive/spark/bucketmapjoin9.q.out
index 5a5e3f6..f439a14 100644
--- ql/src/test/results/clientpositive/spark/bucketmapjoin9.q.out
+++ ql/src/test/results/clientpositive/spark/bucketmapjoin9.q.out
@@ -134,34 +134,35 @@ TOK_QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-2 is a root stage
+ Stage-1 depends on stages: Stage-2
Stage-0 depends on stages: Stage-1
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-2
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 4 (PARTITION-LEVEL SORT, 1)
- Reducer 3 <- Reducer 2 (GROUP, 1)
#### A masked pattern was here ####
Vertices:
- Map 1
+ Map 3
Map Operator Tree:
TableScan
- alias: a
- Statistics: Num rows: 687 Data size: 2750 Basic stats: COMPLETE Column stats: NONE
+ alias: b
+ Statistics: Num rows: 1050 Data size: 4200 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: key is not null (type: boolean)
- Statistics: Num rows: 344 Data size: 1377 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int)
- sort order: +
- Map-reduce partition columns: key (type: int)
- Statistics: Num rows: 344 Data size: 1377 Basic stats: COMPLETE Column stats: NONE
- tag: 0
- auto parallelism: false
+ Statistics: Num rows: 525 Data size: 2100 Basic stats: COMPLETE Column stats: NONE
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ keys:
+ 0 key (type: int)
+ 1 key (type: int)
+ Position of Big Table: 0
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
@@ -174,22 +175,22 @@ STAGE PLANS:
part 1
properties:
COLUMN_STATS_ACCURATE true
- bucket_count 2
+ bucket_count 3
bucket_field_name key
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_1
- numFiles 2
+ name default.srcbucket_mapjoin_part_2
+ numFiles 3
numRows 0
partition_columns part
partition_columns.types string
rawDataSize 0
- serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- totalSize 2750
+ totalSize 4200
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
@@ -202,35 +203,61 @@ STAGE PLANS:
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_1
+ name default.srcbucket_mapjoin_part_2
partition_columns part
partition_columns.types string
- serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.srcbucket_mapjoin_part_1
- name: default.srcbucket_mapjoin_part_1
+ name: default.srcbucket_mapjoin_part_2
+ name: default.srcbucket_mapjoin_part_2
Truncated Path -> Alias:
- /srcbucket_mapjoin_part_1/part=1 [a]
- Map 4
+ /srcbucket_mapjoin_part_2/part=1 [b]
+
+ Stage: Stage-1
+ Spark
+ Edges:
+ Reducer 2 <- Map 1 (GROUP, 1)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
Map Operator Tree:
TableScan
- alias: b
- Statistics: Num rows: 1050 Data size: 4200 Basic stats: COMPLETE Column stats: NONE
+ alias: a
+ Statistics: Num rows: 687 Data size: 2750 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: key is not null (type: boolean)
- Statistics: Num rows: 525 Data size: 2100 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int)
- sort order: +
- Map-reduce partition columns: key (type: int)
- Statistics: Num rows: 525 Data size: 2100 Basic stats: COMPLETE Column stats: NONE
- tag: 1
- auto parallelism: false
+ Statistics: Num rows: 344 Data size: 1377 Basic stats: COMPLETE Column stats: NONE
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ keys:
+ 0 key (type: int)
+ 1 key (type: int)
+ input vertices:
+ 1 Map 3
+ Position of Big Table: 0
+ Statistics: Num rows: 577 Data size: 2310 Basic stats: COMPLETE Column stats: NONE
+ Group By Operator
+ aggregations: count()
+ mode: hash
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ sort order:
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+ tag: -1
+ value expressions: _col0 (type: bigint)
+ auto parallelism: false
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
@@ -243,22 +270,22 @@ STAGE PLANS:
part 1
properties:
COLUMN_STATS_ACCURATE true
- bucket_count 3
+ bucket_count 2
bucket_field_name key
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_2
- numFiles 3
+ name default.srcbucket_mapjoin_part_1
+ numFiles 2
numRows 0
partition_columns part
partition_columns.types string
rawDataSize 0
- serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- totalSize 4200
+ totalSize 2750
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
@@ -271,40 +298,19 @@ STAGE PLANS:
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_2
+ name default.srcbucket_mapjoin_part_1
partition_columns part
partition_columns.types string
- serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.srcbucket_mapjoin_part_2
- name: default.srcbucket_mapjoin_part_2
+ name: default.srcbucket_mapjoin_part_1
+ name: default.srcbucket_mapjoin_part_1
Truncated Path -> Alias:
- /srcbucket_mapjoin_part_2/part=1 [b]
+ /srcbucket_mapjoin_part_1/part=1 [a]
Reducer 2
- Needs Tagging: true
- Reduce Operator Tree:
- Join Operator
- condition map:
- Inner Join 0 to 1
- condition expressions:
- 0
- 1
- Statistics: Num rows: 577 Data size: 2310 Basic stats: COMPLETE Column stats: NONE
- Group By Operator
- aggregations: count()
- mode: hash
- outputColumnNames: _col0
- Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- sort order:
- Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
- tag: -1
- value expressions: _col0 (type: bigint)
- auto parallelism: false
- Reducer 3
Needs Tagging: false
Reduce Operator Tree:
Group By Operator
@@ -470,34 +476,35 @@ TOK_QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-2 is a root stage
+ Stage-1 depends on stages: Stage-2
Stage-0 depends on stages: Stage-1
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-2
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 4 (PARTITION-LEVEL SORT, 3)
- Reducer 3 <- Reducer 2 (GROUP, 1)
#### A masked pattern was here ####
Vertices:
- Map 1
+ Map 3
Map Operator Tree:
TableScan
- alias: a
+ alias: b
Statistics: Num rows: 687 Data size: 2750 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: key is not null (type: boolean)
Statistics: Num rows: 344 Data size: 1377 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int)
- sort order: +
- Map-reduce partition columns: key (type: int)
- Statistics: Num rows: 344 Data size: 1377 Basic stats: COMPLETE Column stats: NONE
- tag: 0
- auto parallelism: false
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ keys:
+ 0 key (type: int)
+ 1 key (type: int)
+ Position of Big Table: 0
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
@@ -511,18 +518,18 @@ STAGE PLANS:
properties:
COLUMN_STATS_ACCURATE true
bucket_count 2
- bucket_field_name key
+ bucket_field_name value
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_1
+ name default.srcbucket_mapjoin_part_2
numFiles 2
numRows 0
partition_columns part
partition_columns.types string
rawDataSize 0
- serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
totalSize 2750
@@ -538,35 +545,61 @@ STAGE PLANS:
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_1
+ name default.srcbucket_mapjoin_part_2
partition_columns part
partition_columns.types string
- serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.srcbucket_mapjoin_part_1
- name: default.srcbucket_mapjoin_part_1
+ name: default.srcbucket_mapjoin_part_2
+ name: default.srcbucket_mapjoin_part_2
Truncated Path -> Alias:
- /srcbucket_mapjoin_part_1/part=1 [a]
- Map 4
+ /srcbucket_mapjoin_part_2/part=1 [b]
+
+ Stage: Stage-1
+ Spark
+ Edges:
+ Reducer 2 <- Map 1 (GROUP, 1)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
Map Operator Tree:
TableScan
- alias: b
+ alias: a
Statistics: Num rows: 687 Data size: 2750 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: key is not null (type: boolean)
Statistics: Num rows: 344 Data size: 1377 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int)
- sort order: +
- Map-reduce partition columns: key (type: int)
- Statistics: Num rows: 344 Data size: 1377 Basic stats: COMPLETE Column stats: NONE
- tag: 1
- auto parallelism: false
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ keys:
+ 0 key (type: int)
+ 1 key (type: int)
+ input vertices:
+ 1 Map 3
+ Position of Big Table: 0
+ Statistics: Num rows: 378 Data size: 1514 Basic stats: COMPLETE Column stats: NONE
+ Group By Operator
+ aggregations: count()
+ mode: hash
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ sort order:
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+ tag: -1
+ value expressions: _col0 (type: bigint)
+ auto parallelism: false
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
@@ -580,18 +613,18 @@ STAGE PLANS:
properties:
COLUMN_STATS_ACCURATE true
bucket_count 2
- bucket_field_name value
+ bucket_field_name key
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_2
+ name default.srcbucket_mapjoin_part_1
numFiles 2
numRows 0
partition_columns part
partition_columns.types string
rawDataSize 0
- serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
totalSize 2750
@@ -607,40 +640,19 @@ STAGE PLANS:
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_2
+ name default.srcbucket_mapjoin_part_1
partition_columns part
partition_columns.types string
- serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.srcbucket_mapjoin_part_2
- name: default.srcbucket_mapjoin_part_2
+ name: default.srcbucket_mapjoin_part_1
+ name: default.srcbucket_mapjoin_part_1
Truncated Path -> Alias:
- /srcbucket_mapjoin_part_2/part=1 [b]
+ /srcbucket_mapjoin_part_1/part=1 [a]
Reducer 2
- Needs Tagging: true
- Reduce Operator Tree:
- Join Operator
- condition map:
- Inner Join 0 to 1
- condition expressions:
- 0
- 1
- Statistics: Num rows: 378 Data size: 1514 Basic stats: COMPLETE Column stats: NONE
- Group By Operator
- aggregations: count()
- mode: hash
- outputColumnNames: _col0
- Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- sort order:
- Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
- tag: -1
- value expressions: _col0 (type: bigint)
- auto parallelism: false
- Reducer 3
Needs Tagging: false
Reduce Operator Tree:
Group By Operator
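
Note on bucketmapjoin9.q.out: the same count(*) pattern; between the two EXPLAINs the metadata for srcbucket_mapjoin_part_2 flips from 3 buckets on key to 2 buckets on value, which the second plan reflects. DDL of the kind presumably run between the EXPLAINs — a hypothetical reconstruction, as the statement itself is not in these hunks:

    -- Rewrites only the bucketing metadata on the small side; this would
    -- account for the bucket_count / bucket_field_name churn in the hunks.
    ALTER TABLE srcbucket_mapjoin_part_2
      CLUSTERED BY (value) INTO 2 BUCKETS;
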
diff --git ql/src/test/results/clientpositive/spark/bucketmapjoin_negative.q.out ql/src/test/results/clientpositive/spark/bucketmapjoin_negative.q.out
index 5ac3f4c..a2bf84c 100644
--- ql/src/test/results/clientpositive/spark/bucketmapjoin_negative.q.out
+++ ql/src/test/results/clientpositive/spark/bucketmapjoin_negative.q.out
@@ -133,193 +133,196 @@ TOK_QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-3 is a root stage
+ Stage-1 depends on stages: Stage-3
Stage-0 depends on stages: Stage-1
Stage-2 depends on stages: Stage-0
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-3
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 3 (PARTITION-LEVEL SORT, 1)
#### A masked pattern was here ####
Vertices:
- Map 1
+ Map 2
Map Operator Tree:
TableScan
- alias: a
- Statistics: Num rows: 26 Data size: 2750 Basic stats: COMPLETE Column stats: NONE
+ alias: b
+ Statistics: Num rows: 40 Data size: 4200 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: key is not null (type: boolean)
- Statistics: Num rows: 13 Data size: 1375 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int)
- sort order: +
- Map-reduce partition columns: key (type: int)
- Statistics: Num rows: 13 Data size: 1375 Basic stats: COMPLETE Column stats: NONE
- tag: 0
- value expressions: value (type: string)
- auto parallelism: false
+ Statistics: Num rows: 20 Data size: 2100 Basic stats: COMPLETE Column stats: NONE
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0 {key} {value}
+ 1 {value}
+ keys:
+ 0 key (type: int)
+ 1 key (type: int)
+ Position of Big Table: 0
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
#### A masked pattern was here ####
Partition
- base file name: srcbucket_mapjoin
+ base file name: ds=2008-04-08
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
properties:
COLUMN_STATS_ACCURATE true
- bucket_count 2
+ bucket_count 3
bucket_field_name key
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin
- numFiles 2
- serialization.ddl struct srcbucket_mapjoin { i32 key, string value}
+ name default.srcbucket_mapjoin_part
+ numFiles 3
+ numRows 0
+ partition_columns ds
+ partition_columns.types string
+ rawDataSize 0
+ serialization.ddl struct srcbucket_mapjoin_part { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- totalSize 2750
+ totalSize 4200
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
properties:
- COLUMN_STATS_ACCURATE true
- bucket_count 2
+ bucket_count 3
bucket_field_name key
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin
- numFiles 2
- serialization.ddl struct srcbucket_mapjoin { i32 key, string value}
+ name default.srcbucket_mapjoin_part
+ partition_columns ds
+ partition_columns.types string
+ serialization.ddl struct srcbucket_mapjoin_part { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- totalSize 2750
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.srcbucket_mapjoin
- name: default.srcbucket_mapjoin
+ name: default.srcbucket_mapjoin_part
+ name: default.srcbucket_mapjoin_part
Truncated Path -> Alias:
- /srcbucket_mapjoin [a]
- Map 3
+ /srcbucket_mapjoin_part/ds=2008-04-08 [b]
+
+ Stage: Stage-1
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
Map Operator Tree:
TableScan
- alias: b
- Statistics: Num rows: 40 Data size: 4200 Basic stats: COMPLETE Column stats: NONE
+ alias: a
+ Statistics: Num rows: 26 Data size: 2750 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: key is not null (type: boolean)
- Statistics: Num rows: 20 Data size: 2100 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int)
- sort order: +
- Map-reduce partition columns: key (type: int)
- Statistics: Num rows: 20 Data size: 2100 Basic stats: COMPLETE Column stats: NONE
- tag: 1
- value expressions: value (type: string)
- auto parallelism: false
+ Statistics: Num rows: 13 Data size: 1375 Basic stats: COMPLETE Column stats: NONE
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {key} {value}
+ 1 {value}
+ keys:
+ 0 key (type: int)
+ 1 key (type: int)
+ outputColumnNames: _col0, _col1, _col6
+ input vertices:
+ 1 Map 2
+ Position of Big Table: 0
+ Statistics: Num rows: 22 Data size: 2310 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: _col0 (type: int), _col1 (type: string), _col6 (type: string)
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 22 Data size: 2310 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ GlobalTableId: 1
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ Statistics: Num rows: 22 Data size: 2310 Basic stats: COMPLETE Column stats: NONE
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ bucket_count -1
+ columns key,value1,value2
+ columns.comments
+ columns.types string:string:string
+#### A masked pattern was here ####
+ name default.bucketmapjoin_tmp_result
+ serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucketmapjoin_tmp_result
+ TotalFiles: 1
+ GatherStats: true
+ MultiFileSpray: false
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
#### A masked pattern was here ####
Partition
- base file name: ds=2008-04-08
+ base file name: srcbucket_mapjoin
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
- partition values:
- ds 2008-04-08
properties:
COLUMN_STATS_ACCURATE true
- bucket_count 3
+ bucket_count 2
bucket_field_name key
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part
- numFiles 3
- numRows 0
- partition_columns ds
- partition_columns.types string
- rawDataSize 0
- serialization.ddl struct srcbucket_mapjoin_part { i32 key, string value}
+ name default.srcbucket_mapjoin
+ numFiles 2
+ serialization.ddl struct srcbucket_mapjoin { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- totalSize 4200
+ totalSize 2750
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
properties:
- bucket_count 3
+ COLUMN_STATS_ACCURATE true
+ bucket_count 2
bucket_field_name key
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part
- partition_columns ds
- partition_columns.types string
- serialization.ddl struct srcbucket_mapjoin_part { i32 key, string value}
+ name default.srcbucket_mapjoin
+ numFiles 2
+ serialization.ddl struct srcbucket_mapjoin { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 2750
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.srcbucket_mapjoin_part
- name: default.srcbucket_mapjoin_part
+ name: default.srcbucket_mapjoin
+ name: default.srcbucket_mapjoin
Truncated Path -> Alias:
- /srcbucket_mapjoin_part/ds=2008-04-08 [b]
- Reducer 2
- Needs Tagging: true
- Reduce Operator Tree:
- Join Operator
- condition map:
- Inner Join 0 to 1
- condition expressions:
- 0 {KEY.reducesinkkey0} {VALUE._col0}
- 1 {VALUE._col0}
- outputColumnNames: _col0, _col1, _col6
- Statistics: Num rows: 22 Data size: 2310 Basic stats: COMPLETE Column stats: NONE
- Select Operator
- expressions: _col0 (type: int), _col1 (type: string), _col6 (type: string)
- outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 22 Data size: 2310 Basic stats: COMPLETE Column stats: NONE
- File Output Operator
- compressed: false
- GlobalTableId: 1
-#### A masked pattern was here ####
- NumFilesPerFileSink: 1
- Statistics: Num rows: 22 Data size: 2310 Basic stats: COMPLETE Column stats: NONE
-#### A masked pattern was here ####
- table:
- input format: org.apache.hadoop.mapred.TextInputFormat
- output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
- properties:
- bucket_count -1
- columns key,value1,value2
- columns.comments
- columns.types string:string:string
-#### A masked pattern was here ####
- name default.bucketmapjoin_tmp_result
- serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2}
- serialization.format 1
- serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
-#### A masked pattern was here ####
- serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.bucketmapjoin_tmp_result
- TotalFiles: 1
- GatherStats: true
- MultiFileSpray: false
+ /srcbucket_mapjoin [a]
Stage: Stage-0
Move Operator
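
Note on bucketmapjoin_negative.q.out: here the bucket counts deliberately mismatch (2 buckets on srcbucket_mapjoin vs 3 on srcbucket_mapjoin_part), so a bucket map join is not legal — yet on the Spark branch the join is still converted to a plain map join, with b built into the hash table and a streamed ("Position of Big Table: 0"). A hedged sketch of the query these plans imply; the hint and predicate placement are assumptions:

    INSERT OVERWRITE TABLE bucketmapjoin_tmp_result
    SELECT /*+ MAPJOIN(b) */ a.key, a.value, b.value
    FROM srcbucket_mapjoin a
    JOIN srcbucket_mapjoin_part b
      ON a.key = b.key AND b.ds = '2008-04-08';
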
diff --git ql/src/test/results/clientpositive/spark/bucketmapjoin_negative2.q.out ql/src/test/results/clientpositive/spark/bucketmapjoin_negative2.q.out
index e4ff965..b954feb 100644
--- ql/src/test/results/clientpositive/spark/bucketmapjoin_negative2.q.out
+++ ql/src/test/results/clientpositive/spark/bucketmapjoin_negative2.q.out
@@ -135,43 +135,46 @@ TOK_QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-3 is a root stage
+ Stage-1 depends on stages: Stage-3
Stage-0 depends on stages: Stage-1
Stage-2 depends on stages: Stage-0
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-3
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 3 (PARTITION-LEVEL SORT, 1)
#### A masked pattern was here ####
Vertices:
- Map 1
+ Map 2
Map Operator Tree:
TableScan
- alias: a
- Statistics: Num rows: 26 Data size: 2750 Basic stats: COMPLETE Column stats: NONE
+ alias: b
+ Statistics: Num rows: 58 Data size: 6124 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: key is not null (type: boolean)
- Statistics: Num rows: 13 Data size: 1375 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int)
- sort order: +
- Map-reduce partition columns: key (type: int)
- Statistics: Num rows: 13 Data size: 1375 Basic stats: COMPLETE Column stats: NONE
- tag: 0
- value expressions: value (type: string)
- auto parallelism: false
+ Statistics: Num rows: 29 Data size: 3062 Basic stats: COMPLETE Column stats: NONE
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0 {key} {value}
+ 1 {value}
+ keys:
+ 0 key (type: int)
+ 1 key (type: int)
+ Position of Big Table: 0
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
#### A masked pattern was here ####
Partition
- base file name: srcbucket_mapjoin
+ base file name: ds=2008-04-08
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ partition values:
+ ds 2008-04-08
properties:
COLUMN_STATS_ACCURATE true
bucket_count 2
@@ -180,65 +183,45 @@ STAGE PLANS:
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin
+ name default.srcbucket_mapjoin_part_2
numFiles 2
- serialization.ddl struct srcbucket_mapjoin { i32 key, string value}
+ numRows 0
+ partition_columns ds
+ partition_columns.types string
+ rawDataSize 0
+ serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- totalSize 2750
+ totalSize 3062
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
properties:
- COLUMN_STATS_ACCURATE true
bucket_count 2
bucket_field_name key
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin
- numFiles 2
- serialization.ddl struct srcbucket_mapjoin { i32 key, string value}
+ name default.srcbucket_mapjoin_part_2
+ partition_columns ds
+ partition_columns.types string
+ serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- totalSize 2750
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.srcbucket_mapjoin
- name: default.srcbucket_mapjoin
- Truncated Path -> Alias:
- /srcbucket_mapjoin [a]
- Map 3
- Map Operator Tree:
- TableScan
- alias: b
- Statistics: Num rows: 58 Data size: 6124 Basic stats: COMPLETE Column stats: NONE
- GatherStats: false
- Filter Operator
- isSamplingPred: false
- predicate: key is not null (type: boolean)
- Statistics: Num rows: 29 Data size: 3062 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int)
- sort order: +
- Map-reduce partition columns: key (type: int)
- Statistics: Num rows: 29 Data size: 3062 Basic stats: COMPLETE Column stats: NONE
- tag: 1
- value expressions: value (type: string)
- auto parallelism: false
- Path -> Alias:
-#### A masked pattern was here ####
- Path -> Partition:
+ name: default.srcbucket_mapjoin_part_2
+ name: default.srcbucket_mapjoin_part_2
#### A masked pattern was here ####
Partition
- base file name: ds=2008-04-08
+ base file name: ds=2008-04-09
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
partition values:
- ds 2008-04-08
+ ds 2008-04-09
properties:
COLUMN_STATS_ACCURATE true
bucket_count 2
@@ -279,13 +262,78 @@ STAGE PLANS:
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
name: default.srcbucket_mapjoin_part_2
name: default.srcbucket_mapjoin_part_2
+ Truncated Path -> Alias:
+ /srcbucket_mapjoin_part_2/ds=2008-04-08 [b]
+ /srcbucket_mapjoin_part_2/ds=2008-04-09 [b]
+
+ Stage: Stage-1
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: a
+ Statistics: Num rows: 26 Data size: 2750 Basic stats: COMPLETE Column stats: NONE
+ GatherStats: false
+ Filter Operator
+ isSamplingPred: false
+ predicate: key is not null (type: boolean)
+ Statistics: Num rows: 13 Data size: 1375 Basic stats: COMPLETE Column stats: NONE
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {key} {value}
+ 1 {value}
+ keys:
+ 0 key (type: int)
+ 1 key (type: int)
+ outputColumnNames: _col0, _col1, _col6
+ input vertices:
+ 1 Map 2
+ Position of Big Table: 0
+ Statistics: Num rows: 31 Data size: 3368 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: _col0 (type: int), _col1 (type: string), _col6 (type: string)
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 31 Data size: 3368 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ GlobalTableId: 1
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ Statistics: Num rows: 31 Data size: 3368 Basic stats: COMPLETE Column stats: NONE
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ bucket_count -1
+ columns key,value1,value2
+ columns.comments
+ columns.types string:string:string
+#### A masked pattern was here ####
+ name default.bucketmapjoin_tmp_result
+ serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.bucketmapjoin_tmp_result
+ TotalFiles: 1
+ GatherStats: true
+ MultiFileSpray: false
+ Local Work:
+ Map Reduce Local Work
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
#### A masked pattern was here ####
Partition
- base file name: ds=2008-04-09
+ base file name: srcbucket_mapjoin
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
- partition values:
- ds 2008-04-09
properties:
COLUMN_STATS_ACCURATE true
bucket_count 2
@@ -294,82 +342,37 @@ STAGE PLANS:
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_2
+ name default.srcbucket_mapjoin
numFiles 2
- numRows 0
- partition_columns ds
- partition_columns.types string
- rawDataSize 0
- serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
+ serialization.ddl struct srcbucket_mapjoin { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- totalSize 3062
+ totalSize 2750
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
properties:
+ COLUMN_STATS_ACCURATE true
bucket_count 2
bucket_field_name key
columns key,value
columns.comments
columns.types int:string
#### A masked pattern was here ####
- name default.srcbucket_mapjoin_part_2
- partition_columns ds
- partition_columns.types string
- serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value}
+ name default.srcbucket_mapjoin
+ numFiles 2
+ serialization.ddl struct srcbucket_mapjoin { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 2750
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.srcbucket_mapjoin_part_2
- name: default.srcbucket_mapjoin_part_2
+ name: default.srcbucket_mapjoin
+ name: default.srcbucket_mapjoin
Truncated Path -> Alias:
- /srcbucket_mapjoin_part_2/ds=2008-04-08 [b]
- /srcbucket_mapjoin_part_2/ds=2008-04-09 [b]
- Reducer 2
- Needs Tagging: true
- Reduce Operator Tree:
- Join Operator
- condition map:
- Inner Join 0 to 1
- condition expressions:
- 0 {KEY.reducesinkkey0} {VALUE._col0}
- 1 {VALUE._col0}
- outputColumnNames: _col0, _col1, _col6
- Statistics: Num rows: 31 Data size: 3368 Basic stats: COMPLETE Column stats: NONE
- Select Operator
- expressions: _col0 (type: int), _col1 (type: string), _col6 (type: string)
- outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 31 Data size: 3368 Basic stats: COMPLETE Column stats: NONE
- File Output Operator
- compressed: false
- GlobalTableId: 1
-#### A masked pattern was here ####
- NumFilesPerFileSink: 1
- Statistics: Num rows: 31 Data size: 3368 Basic stats: COMPLETE Column stats: NONE
-#### A masked pattern was here ####
- table:
- input format: org.apache.hadoop.mapred.TextInputFormat
- output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
- properties:
- bucket_count -1
- columns key,value1,value2
- columns.comments
- columns.types string:string:string
-#### A masked pattern was here ####
- name default.bucketmapjoin_tmp_result
- serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2}
- serialization.format 1
- serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
-#### A masked pattern was here ####
- serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.bucketmapjoin_tmp_result
- TotalFiles: 1
- GatherStats: true
- MultiFileSpray: false
+ /srcbucket_mapjoin [a]
Stage: Stage-0
Move Operator
diff --git ql/src/test/results/clientpositive/spark/bucketmapjoin_negative3.q.out ql/src/test/results/clientpositive/spark/bucketmapjoin_negative3.q.out
index fce5566..bfe5438 100644
--- ql/src/test/results/clientpositive/spark/bucketmapjoin_negative3.q.out
+++ ql/src/test/results/clientpositive/spark/bucketmapjoin_negative3.q.out
@@ -195,33 +195,35 @@ TOK_QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-2 is a root stage
+ Stage-1 depends on stages: Stage-2
Stage-0 depends on stages: Stage-1
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-2
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 3 (PARTITION-LEVEL SORT, 1)
#### A masked pattern was here ####
Vertices:
- Map 1
+ Map 2
Map Operator Tree:
TableScan
- alias: l
+ alias: r
Statistics: Num rows: 21 Data size: 4200 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: (key is not null and value is not null) (type: boolean)
Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: string), value (type: string)
- sort order: ++
- Map-reduce partition columns: key (type: string), value (type: string)
- Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE
- tag: 0
- auto parallelism: false
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0 {key} {value}
+ 1
+ keys:
+ 0 key (type: string), value (type: string)
+ 1 key (type: string), value (type: string)
+ Position of Big Table: 0
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
@@ -270,24 +272,63 @@ STAGE PLANS:
name: default.test1
name: default.test1
Truncated Path -> Alias:
- /test1 [l]
- Map 3
+ /test1 [r]
+
+ Stage: Stage-1
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
Map Operator Tree:
TableScan
- alias: r
+ alias: l
Statistics: Num rows: 21 Data size: 4200 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: (key is not null and value is not null) (type: boolean)
Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: string), value (type: string)
- sort order: ++
- Map-reduce partition columns: key (type: string), value (type: string)
- Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE
- tag: 1
- auto parallelism: false
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {key} {value}
+ 1 {key} {value}
+ keys:
+ 0 key (type: string), value (type: string)
+ 1 key (type: string), value (type: string)
+ outputColumnNames: _col0, _col1, _col5, _col6
+ input vertices:
+ 1 Map 2
+ Position of Big Table: 0
+ Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string)
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0,_col1,_col2,_col3
+ columns.types string:string:string:string
+ escape.delim \
+ hive.serialization.extend.nesting.levels true
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
@@ -336,43 +377,7 @@ STAGE PLANS:
name: default.test1
name: default.test1
Truncated Path -> Alias:
- /test1 [r]
- Reducer 2
- Needs Tagging: true
- Reduce Operator Tree:
- Join Operator
- condition map:
- Inner Join 0 to 1
- condition expressions:
- 0 {KEY.reducesinkkey0} {KEY.reducesinkkey1}
- 1 {KEY.reducesinkkey0} {KEY.reducesinkkey1}
- outputColumnNames: _col0, _col1, _col5, _col6
- Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE
- Select Operator
- expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string)
- outputColumnNames: _col0, _col1, _col2, _col3
- Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE
- File Output Operator
- compressed: false
- GlobalTableId: 0
-#### A masked pattern was here ####
- NumFilesPerFileSink: 1
- Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE
-#### A masked pattern was here ####
- table:
- input format: org.apache.hadoop.mapred.TextInputFormat
- output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
- properties:
- columns _col0,_col1,_col2,_col3
- columns.types string:string:string:string
- escape.delim \
- hive.serialization.extend.nesting.levels true
- serialization.format 1
- serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- TotalFiles: 1
- GatherStats: false
- MultiFileSpray: false
+ /test1 [l]
Stage: Stage-0
Fetch Operator
@@ -431,33 +436,35 @@ TOK_QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-2 is a root stage
+ Stage-1 depends on stages: Stage-2
Stage-0 depends on stages: Stage-1
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-2
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 3 (PARTITION-LEVEL SORT, 1)
#### A masked pattern was here ####
Vertices:
- Map 1
+ Map 2
Map Operator Tree:
TableScan
- alias: l
+ alias: r
Statistics: Num rows: 21 Data size: 4200 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: (key is not null and value is not null) (type: boolean)
Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: string), value (type: string)
- sort order: ++
- Map-reduce partition columns: key (type: string), value (type: string)
- Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE
- tag: 0
- auto parallelism: false
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0 {key} {value}
+ 1
+ keys:
+ 0 key (type: string), value (type: string)
+ 1 key (type: string), value (type: string)
+ Position of Big Table: 0
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
@@ -506,24 +513,63 @@ STAGE PLANS:
name: default.test2
name: default.test2
Truncated Path -> Alias:
- /test2 [l]
- Map 3
+ /test2 [r]
+
+ Stage: Stage-1
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
Map Operator Tree:
TableScan
- alias: r
+ alias: l
Statistics: Num rows: 21 Data size: 4200 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: (key is not null and value is not null) (type: boolean)
Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: string), value (type: string)
- sort order: ++
- Map-reduce partition columns: key (type: string), value (type: string)
- Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE
- tag: 1
- auto parallelism: false
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {key} {value}
+ 1 {key} {value}
+ keys:
+ 0 key (type: string), value (type: string)
+ 1 key (type: string), value (type: string)
+ outputColumnNames: _col0, _col1, _col5, _col6
+ input vertices:
+ 1 Map 2
+ Position of Big Table: 0
+ Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string)
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0,_col1,_col2,_col3
+ columns.types string:string:string:string
+ escape.delim \
+ hive.serialization.extend.nesting.levels true
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
@@ -572,43 +618,7 @@ STAGE PLANS:
name: default.test2
name: default.test2
Truncated Path -> Alias:
- /test2 [r]
- Reducer 2
- Needs Tagging: true
- Reduce Operator Tree:
- Join Operator
- condition map:
- Inner Join 0 to 1
- condition expressions:
- 0 {KEY.reducesinkkey0} {KEY.reducesinkkey1}
- 1 {KEY.reducesinkkey0} {KEY.reducesinkkey1}
- outputColumnNames: _col0, _col1, _col5, _col6
- Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE
- Select Operator
- expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string)
- outputColumnNames: _col0, _col1, _col2, _col3
- Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE
- File Output Operator
- compressed: false
- GlobalTableId: 0
-#### A masked pattern was here ####
- NumFilesPerFileSink: 1
- Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE
-#### A masked pattern was here ####
- table:
- input format: org.apache.hadoop.mapred.TextInputFormat
- output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
- properties:
- columns _col0,_col1,_col2,_col3
- columns.types string:string:string:string
- escape.delim \
- hive.serialization.extend.nesting.levels true
- serialization.format 1
- serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- TotalFiles: 1
- GatherStats: false
- MultiFileSpray: false
+ /test2 [l]
Stage: Stage-0
Fetch Operator
@@ -664,34 +674,35 @@ TOK_QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-2 is a root stage
+ Stage-1 depends on stages: Stage-2
Stage-0 depends on stages: Stage-1
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-2
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 3 (PARTITION-LEVEL SORT, 1)
#### A masked pattern was here ####
Vertices:
- Map 1
+ Map 2
Map Operator Tree:
TableScan
- alias: l
+ alias: r
Statistics: Num rows: 21 Data size: 4200 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
- predicate: (key + key) is not null (type: boolean)
+ predicate: UDFToDouble(key) is not null (type: boolean)
Statistics: Num rows: 11 Data size: 2200 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: (key + key) (type: double)
- sort order: +
- Map-reduce partition columns: (key + key) (type: double)
- Statistics: Num rows: 11 Data size: 2200 Basic stats: COMPLETE Column stats: NONE
- tag: 0
- value expressions: key (type: string), value (type: string)
- auto parallelism: false
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0 {key} {value}
+ 1 {key} {value}
+ keys:
+ 0 (key + key) (type: double)
+ 1 UDFToDouble(key) (type: double)
+ Position of Big Table: 0
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
@@ -740,25 +751,63 @@ STAGE PLANS:
name: default.test1
name: default.test1
Truncated Path -> Alias:
- /test1 [l]
- Map 3
+ /test1 [r]
+
+ Stage: Stage-1
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
Map Operator Tree:
TableScan
- alias: r
+ alias: l
Statistics: Num rows: 21 Data size: 4200 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
- predicate: UDFToDouble(key) is not null (type: boolean)
+ predicate: (key + key) is not null (type: boolean)
Statistics: Num rows: 11 Data size: 2200 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: UDFToDouble(key) (type: double)
- sort order: +
- Map-reduce partition columns: UDFToDouble(key) (type: double)
- Statistics: Num rows: 11 Data size: 2200 Basic stats: COMPLETE Column stats: NONE
- tag: 1
- value expressions: key (type: string), value (type: string)
- auto parallelism: false
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {key} {value}
+ 1 {key} {value}
+ keys:
+ 0 (key + key) (type: double)
+ 1 UDFToDouble(key) (type: double)
+ outputColumnNames: _col0, _col1, _col5, _col6
+ input vertices:
+ 1 Map 2
+ Position of Big Table: 0
+ Statistics: Num rows: 12 Data size: 2420 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string)
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Statistics: Num rows: 12 Data size: 2420 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ Statistics: Num rows: 12 Data size: 2420 Basic stats: COMPLETE Column stats: NONE
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0,_col1,_col2,_col3
+ columns.types string:string:string:string
+ escape.delim \
+ hive.serialization.extend.nesting.levels true
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
@@ -807,43 +856,7 @@ STAGE PLANS:
name: default.test1
name: default.test1
Truncated Path -> Alias:
- /test1 [r]
- Reducer 2
- Needs Tagging: true
- Reduce Operator Tree:
- Join Operator
- condition map:
- Inner Join 0 to 1
- condition expressions:
- 0 {VALUE._col0} {VALUE._col1}
- 1 {VALUE._col0} {VALUE._col1}
- outputColumnNames: _col0, _col1, _col5, _col6
- Statistics: Num rows: 12 Data size: 2420 Basic stats: COMPLETE Column stats: NONE
- Select Operator
- expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string)
- outputColumnNames: _col0, _col1, _col2, _col3
- Statistics: Num rows: 12 Data size: 2420 Basic stats: COMPLETE Column stats: NONE
- File Output Operator
- compressed: false
- GlobalTableId: 0
-#### A masked pattern was here ####
- NumFilesPerFileSink: 1
- Statistics: Num rows: 12 Data size: 2420 Basic stats: COMPLETE Column stats: NONE
-#### A masked pattern was here ####
- table:
- input format: org.apache.hadoop.mapred.TextInputFormat
- output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
- properties:
- columns _col0,_col1,_col2,_col3
- columns.types string:string:string:string
- escape.delim \
- hive.serialization.extend.nesting.levels true
- serialization.format 1
- serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- TotalFiles: 1
- GatherStats: false
- MultiFileSpray: false
+ /test1 [l]
Stage: Stage-0
Fetch Operator
@@ -902,53 +915,55 @@ TOK_QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-2 is a root stage
+ Stage-1 depends on stages: Stage-2
Stage-0 depends on stages: Stage-1
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-2
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 3 (PARTITION-LEVEL SORT, 1)
#### A masked pattern was here ####
Vertices:
- Map 1
+ Map 2
Map Operator Tree:
TableScan
- alias: l
+ alias: r
Statistics: Num rows: 21 Data size: 4200 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: (key is not null and value is not null) (type: boolean)
Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: string), value (type: string)
- sort order: ++
- Map-reduce partition columns: key (type: string), value (type: string)
- Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE
- tag: 0
- auto parallelism: false
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0 {key} {value}
+ 1
+ keys:
+ 0 key (type: string), value (type: string)
+ 1 key (type: string), value (type: string)
+ Position of Big Table: 0
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
#### A masked pattern was here ####
Partition
- base file name: test1
+ base file name: test2
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
properties:
COLUMN_STATS_ACCURATE true
SORTBUCKETCOLSPREFIX TRUE
bucket_count 3
- bucket_field_name key
+ bucket_field_name value
columns key,value
columns.comments
columns.types string:string
#### A masked pattern was here ####
- name default.test1
+ name default.test2
numFiles 3
- serialization.ddl struct test1 { string key, string value}
+ serialization.ddl struct test2 { string key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
totalSize 4200
@@ -961,60 +976,99 @@ STAGE PLANS:
COLUMN_STATS_ACCURATE true
SORTBUCKETCOLSPREFIX TRUE
bucket_count 3
- bucket_field_name key
+ bucket_field_name value
columns key,value
columns.comments
columns.types string:string
#### A masked pattern was here ####
- name default.test1
+ name default.test2
numFiles 3
- serialization.ddl struct test1 { string key, string value}
+ serialization.ddl struct test2 { string key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
totalSize 4200
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.test1
- name: default.test1
+ name: default.test2
+ name: default.test2
Truncated Path -> Alias:
- /test1 [l]
- Map 3
+ /test2 [r]
+
+ Stage: Stage-1
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
Map Operator Tree:
TableScan
- alias: r
+ alias: l
Statistics: Num rows: 21 Data size: 4200 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: (key is not null and value is not null) (type: boolean)
Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: string), value (type: string)
- sort order: ++
- Map-reduce partition columns: key (type: string), value (type: string)
- Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE
- tag: 1
- auto parallelism: false
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {key} {value}
+ 1 {key} {value}
+ keys:
+ 0 key (type: string), value (type: string)
+ 1 key (type: string), value (type: string)
+ outputColumnNames: _col0, _col1, _col5, _col6
+ input vertices:
+ 1 Map 2
+ Position of Big Table: 0
+ Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string)
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0,_col1,_col2,_col3
+ columns.types string:string:string:string
+ escape.delim \
+ hive.serialization.extend.nesting.levels true
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
#### A masked pattern was here ####
Partition
- base file name: test2
+ base file name: test1
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
properties:
COLUMN_STATS_ACCURATE true
SORTBUCKETCOLSPREFIX TRUE
bucket_count 3
- bucket_field_name value
+ bucket_field_name key
columns key,value
columns.comments
columns.types string:string
#### A masked pattern was here ####
- name default.test2
+ name default.test1
numFiles 3
- serialization.ddl struct test2 { string key, string value}
+ serialization.ddl struct test1 { string key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
totalSize 4200
@@ -1027,59 +1081,23 @@ STAGE PLANS:
COLUMN_STATS_ACCURATE true
SORTBUCKETCOLSPREFIX TRUE
bucket_count 3
- bucket_field_name value
+ bucket_field_name key
columns key,value
columns.comments
columns.types string:string
#### A masked pattern was here ####
- name default.test2
+ name default.test1
numFiles 3
- serialization.ddl struct test2 { string key, string value}
+ serialization.ddl struct test1 { string key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
totalSize 4200
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.test2
- name: default.test2
+ name: default.test1
+ name: default.test1
Truncated Path -> Alias:
- /test2 [r]
- Reducer 2
- Needs Tagging: true
- Reduce Operator Tree:
- Join Operator
- condition map:
- Inner Join 0 to 1
- condition expressions:
- 0 {KEY.reducesinkkey0} {KEY.reducesinkkey1}
- 1 {KEY.reducesinkkey0} {KEY.reducesinkkey1}
- outputColumnNames: _col0, _col1, _col5, _col6
- Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE
- Select Operator
- expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string)
- outputColumnNames: _col0, _col1, _col2, _col3
- Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE
- File Output Operator
- compressed: false
- GlobalTableId: 0
-#### A masked pattern was here ####
- NumFilesPerFileSink: 1
- Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE
-#### A masked pattern was here ####
- table:
- input format: org.apache.hadoop.mapred.TextInputFormat
- output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
- properties:
- columns _col0,_col1,_col2,_col3
- columns.types string:string:string:string
- escape.delim \
- hive.serialization.extend.nesting.levels true
- serialization.format 1
- serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- TotalFiles: 1
- GatherStats: false
- MultiFileSpray: false
+ /test1 [l]
Stage: Stage-0
Fetch Operator
@@ -1138,39 +1156,41 @@ TOK_QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-2 is a root stage
+ Stage-1 depends on stages: Stage-2
Stage-0 depends on stages: Stage-1
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-2
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 3 (PARTITION-LEVEL SORT, 1)
#### A masked pattern was here ####
Vertices:
- Map 1
+ Map 2
Map Operator Tree:
TableScan
- alias: l
+ alias: r
Statistics: Num rows: 21 Data size: 4200 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: (key is not null and value is not null) (type: boolean)
Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: string), value (type: string)
- sort order: ++
- Map-reduce partition columns: key (type: string), value (type: string)
- Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE
- tag: 0
- auto parallelism: false
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0 {key} {value}
+ 1
+ keys:
+ 0 key (type: string), value (type: string)
+ 1 key (type: string), value (type: string)
+ Position of Big Table: 0
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
#### A masked pattern was here ####
Partition
- base file name: test1
+ base file name: test3
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
properties:
@@ -1182,9 +1202,9 @@ STAGE PLANS:
columns.comments
columns.types string:string
#### A masked pattern was here ####
- name default.test1
+ name default.test3
numFiles 3
- serialization.ddl struct test1 { string key, string value}
+ serialization.ddl struct test3 { string key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
totalSize 4200
@@ -1202,41 +1222,80 @@ STAGE PLANS:
columns.comments
columns.types string:string
#### A masked pattern was here ####
- name default.test1
+ name default.test3
numFiles 3
- serialization.ddl struct test1 { string key, string value}
+ serialization.ddl struct test3 { string key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
totalSize 4200
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.test1
- name: default.test1
+ name: default.test3
+ name: default.test3
Truncated Path -> Alias:
- /test1 [l]
- Map 3
+ /test3 [r]
+
+ Stage: Stage-1
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
Map Operator Tree:
TableScan
- alias: r
+ alias: l
Statistics: Num rows: 21 Data size: 4200 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: (key is not null and value is not null) (type: boolean)
Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: string), value (type: string)
- sort order: ++
- Map-reduce partition columns: key (type: string), value (type: string)
- Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE
- tag: 1
- auto parallelism: false
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {key} {value}
+ 1 {key} {value}
+ keys:
+ 0 key (type: string), value (type: string)
+ 1 key (type: string), value (type: string)
+ outputColumnNames: _col0, _col1, _col5, _col6
+ input vertices:
+ 1 Map 2
+ Position of Big Table: 0
+ Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string)
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0,_col1,_col2,_col3
+ columns.types string:string:string:string
+ escape.delim \
+ hive.serialization.extend.nesting.levels true
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
#### A masked pattern was here ####
Partition
- base file name: test3
+ base file name: test1
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
properties:
@@ -1248,9 +1307,9 @@ STAGE PLANS:
columns.comments
columns.types string:string
#### A masked pattern was here ####
- name default.test3
+ name default.test1
numFiles 3
- serialization.ddl struct test3 { string key, string value}
+ serialization.ddl struct test1 { string key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
totalSize 4200
@@ -1268,54 +1327,18 @@ STAGE PLANS:
columns.comments
columns.types string:string
#### A masked pattern was here ####
- name default.test3
+ name default.test1
numFiles 3
- serialization.ddl struct test3 { string key, string value}
+ serialization.ddl struct test1 { string key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
totalSize 4200
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.test3
- name: default.test3
+ name: default.test1
+ name: default.test1
Truncated Path -> Alias:
- /test3 [r]
- Reducer 2
- Needs Tagging: true
- Reduce Operator Tree:
- Join Operator
- condition map:
- Inner Join 0 to 1
- condition expressions:
- 0 {KEY.reducesinkkey0} {KEY.reducesinkkey1}
- 1 {KEY.reducesinkkey0} {KEY.reducesinkkey1}
- outputColumnNames: _col0, _col1, _col5, _col6
- Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE
- Select Operator
- expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string)
- outputColumnNames: _col0, _col1, _col2, _col3
- Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE
- File Output Operator
- compressed: false
- GlobalTableId: 0
-#### A masked pattern was here ####
- NumFilesPerFileSink: 1
- Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE
-#### A masked pattern was here ####
- table:
- input format: org.apache.hadoop.mapred.TextInputFormat
- output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
- properties:
- columns _col0,_col1,_col2,_col3
- columns.types string:string:string:string
- escape.delim \
- hive.serialization.extend.nesting.levels true
- serialization.format 1
- serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- TotalFiles: 1
- GatherStats: false
- MultiFileSpray: false
+ /test1 [l]
Stage: Stage-0
Fetch Operator
@@ -1374,53 +1397,55 @@ TOK_QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-2 is a root stage
+ Stage-1 depends on stages: Stage-2
Stage-0 depends on stages: Stage-1
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-2
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 3 (PARTITION-LEVEL SORT, 1)
#### A masked pattern was here ####
Vertices:
- Map 1
+ Map 2
Map Operator Tree:
TableScan
- alias: l
+ alias: r
Statistics: Num rows: 21 Data size: 4200 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: (key is not null and value is not null) (type: boolean)
Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: string), value (type: string)
- sort order: ++
- Map-reduce partition columns: key (type: string), value (type: string)
- Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE
- tag: 0
- auto parallelism: false
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0 {key} {value}
+ 1
+ keys:
+ 0 key (type: string), value (type: string)
+ 1 key (type: string), value (type: string)
+ Position of Big Table: 0
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
#### A masked pattern was here ####
Partition
- base file name: test1
+ base file name: test4
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
properties:
COLUMN_STATS_ACCURATE true
SORTBUCKETCOLSPREFIX TRUE
bucket_count 3
- bucket_field_name key
+ bucket_field_name value
columns key,value
columns.comments
columns.types string:string
#### A masked pattern was here ####
- name default.test1
+ name default.test4
numFiles 3
- serialization.ddl struct test1 { string key, string value}
+ serialization.ddl struct test4 { string key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
totalSize 4200
@@ -1433,60 +1458,99 @@ STAGE PLANS:
COLUMN_STATS_ACCURATE true
SORTBUCKETCOLSPREFIX TRUE
bucket_count 3
- bucket_field_name key
+ bucket_field_name value
columns key,value
columns.comments
columns.types string:string
#### A masked pattern was here ####
- name default.test1
+ name default.test4
numFiles 3
- serialization.ddl struct test1 { string key, string value}
+ serialization.ddl struct test4 { string key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
totalSize 4200
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.test1
- name: default.test1
+ name: default.test4
+ name: default.test4
Truncated Path -> Alias:
- /test1 [l]
- Map 3
+ /test4 [r]
+
+ Stage: Stage-1
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
Map Operator Tree:
TableScan
- alias: r
+ alias: l
Statistics: Num rows: 21 Data size: 4200 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: (key is not null and value is not null) (type: boolean)
Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: string), value (type: string)
- sort order: ++
- Map-reduce partition columns: key (type: string), value (type: string)
- Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE
- tag: 1
- auto parallelism: false
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {key} {value}
+ 1 {key} {value}
+ keys:
+ 0 key (type: string), value (type: string)
+ 1 key (type: string), value (type: string)
+ outputColumnNames: _col0, _col1, _col5, _col6
+ input vertices:
+ 1 Map 2
+ Position of Big Table: 0
+ Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string)
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0,_col1,_col2,_col3
+ columns.types string:string:string:string
+ escape.delim \
+ hive.serialization.extend.nesting.levels true
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
#### A masked pattern was here ####
Partition
- base file name: test4
+ base file name: test1
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
properties:
COLUMN_STATS_ACCURATE true
SORTBUCKETCOLSPREFIX TRUE
bucket_count 3
- bucket_field_name value
+ bucket_field_name key
columns key,value
columns.comments
columns.types string:string
#### A masked pattern was here ####
- name default.test4
+ name default.test1
numFiles 3
- serialization.ddl struct test4 { string key, string value}
+ serialization.ddl struct test1 { string key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
totalSize 4200
@@ -1499,59 +1563,23 @@ STAGE PLANS:
COLUMN_STATS_ACCURATE true
SORTBUCKETCOLSPREFIX TRUE
bucket_count 3
- bucket_field_name value
+ bucket_field_name key
columns key,value
columns.comments
columns.types string:string
#### A masked pattern was here ####
- name default.test4
+ name default.test1
numFiles 3
- serialization.ddl struct test4 { string key, string value}
+ serialization.ddl struct test1 { string key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
totalSize 4200
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.test4
- name: default.test4
+ name: default.test1
+ name: default.test1
Truncated Path -> Alias:
- /test4 [r]
- Reducer 2
- Needs Tagging: true
- Reduce Operator Tree:
- Join Operator
- condition map:
- Inner Join 0 to 1
- condition expressions:
- 0 {KEY.reducesinkkey0} {KEY.reducesinkkey1}
- 1 {KEY.reducesinkkey0} {KEY.reducesinkkey1}
- outputColumnNames: _col0, _col1, _col5, _col6
- Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE
- Select Operator
- expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string)
- outputColumnNames: _col0, _col1, _col2, _col3
- Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE
- File Output Operator
- compressed: false
- GlobalTableId: 0
-#### A masked pattern was here ####
- NumFilesPerFileSink: 1
- Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE
-#### A masked pattern was here ####
- table:
- input format: org.apache.hadoop.mapred.TextInputFormat
- output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
- properties:
- columns _col0,_col1,_col2,_col3
- columns.types string:string:string:string
- escape.delim \
- hive.serialization.extend.nesting.levels true
- serialization.format 1
- serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- TotalFiles: 1
- GatherStats: false
- MultiFileSpray: false
+ /test1 [l]
Stage: Stage-0
Fetch Operator
@@ -1610,53 +1638,55 @@ TOK_QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-2 is a root stage
+ Stage-1 depends on stages: Stage-2
Stage-0 depends on stages: Stage-1
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-2
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 3 (PARTITION-LEVEL SORT, 1)
#### A masked pattern was here ####
Vertices:
- Map 1
+ Map 2
Map Operator Tree:
TableScan
- alias: l
+ alias: r
Statistics: Num rows: 21 Data size: 4200 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: (key is not null and value is not null) (type: boolean)
Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: string), value (type: string)
- sort order: ++
- Map-reduce partition columns: key (type: string), value (type: string)
- Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE
- tag: 0
- auto parallelism: false
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0 {key} {value}
+ 1
+ keys:
+ 0 key (type: string), value (type: string)
+ 1 key (type: string), value (type: string)
+ Position of Big Table: 0
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
#### A masked pattern was here ####
Partition
- base file name: test2
+ base file name: test3
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
properties:
COLUMN_STATS_ACCURATE true
SORTBUCKETCOLSPREFIX TRUE
bucket_count 3
- bucket_field_name value
+ bucket_field_name key
columns key,value
columns.comments
columns.types string:string
#### A masked pattern was here ####
- name default.test2
+ name default.test3
numFiles 3
- serialization.ddl struct test2 { string key, string value}
+ serialization.ddl struct test3 { string key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
totalSize 4200
@@ -1669,60 +1699,99 @@ STAGE PLANS:
COLUMN_STATS_ACCURATE true
SORTBUCKETCOLSPREFIX TRUE
bucket_count 3
- bucket_field_name value
+ bucket_field_name key
columns key,value
columns.comments
columns.types string:string
#### A masked pattern was here ####
- name default.test2
+ name default.test3
numFiles 3
- serialization.ddl struct test2 { string key, string value}
+ serialization.ddl struct test3 { string key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
totalSize 4200
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.test2
- name: default.test2
+ name: default.test3
+ name: default.test3
Truncated Path -> Alias:
- /test2 [l]
- Map 3
+ /test3 [r]
+
+ Stage: Stage-1
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
Map Operator Tree:
TableScan
- alias: r
+ alias: l
Statistics: Num rows: 21 Data size: 4200 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: (key is not null and value is not null) (type: boolean)
Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: string), value (type: string)
- sort order: ++
- Map-reduce partition columns: key (type: string), value (type: string)
- Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE
- tag: 1
- auto parallelism: false
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {key} {value}
+ 1 {key} {value}
+ keys:
+ 0 key (type: string), value (type: string)
+ 1 key (type: string), value (type: string)
+ outputColumnNames: _col0, _col1, _col5, _col6
+ input vertices:
+ 1 Map 2
+ Position of Big Table: 0
+ Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string)
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0,_col1,_col2,_col3
+ columns.types string:string:string:string
+ escape.delim \
+ hive.serialization.extend.nesting.levels true
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
#### A masked pattern was here ####
Partition
- base file name: test3
+ base file name: test2
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
properties:
COLUMN_STATS_ACCURATE true
SORTBUCKETCOLSPREFIX TRUE
bucket_count 3
- bucket_field_name key
+ bucket_field_name value
columns key,value
columns.comments
columns.types string:string
#### A masked pattern was here ####
- name default.test3
+ name default.test2
numFiles 3
- serialization.ddl struct test3 { string key, string value}
+ serialization.ddl struct test2 { string key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
totalSize 4200
@@ -1735,59 +1804,23 @@ STAGE PLANS:
COLUMN_STATS_ACCURATE true
SORTBUCKETCOLSPREFIX TRUE
bucket_count 3
- bucket_field_name key
+ bucket_field_name value
columns key,value
columns.comments
columns.types string:string
#### A masked pattern was here ####
- name default.test3
+ name default.test2
numFiles 3
- serialization.ddl struct test3 { string key, string value}
+ serialization.ddl struct test2 { string key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
totalSize 4200
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.test3
- name: default.test3
+ name: default.test2
+ name: default.test2
Truncated Path -> Alias:
- /test3 [r]
- Reducer 2
- Needs Tagging: true
- Reduce Operator Tree:
- Join Operator
- condition map:
- Inner Join 0 to 1
- condition expressions:
- 0 {KEY.reducesinkkey0} {KEY.reducesinkkey1}
- 1 {KEY.reducesinkkey0} {KEY.reducesinkkey1}
- outputColumnNames: _col0, _col1, _col5, _col6
- Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE
- Select Operator
- expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string)
- outputColumnNames: _col0, _col1, _col2, _col3
- Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE
- File Output Operator
- compressed: false
- GlobalTableId: 0
-#### A masked pattern was here ####
- NumFilesPerFileSink: 1
- Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE
-#### A masked pattern was here ####
- table:
- input format: org.apache.hadoop.mapred.TextInputFormat
- output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
- properties:
- columns _col0,_col1,_col2,_col3
- columns.types string:string:string:string
- escape.delim \
- hive.serialization.extend.nesting.levels true
- serialization.format 1
- serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- TotalFiles: 1
- GatherStats: false
- MultiFileSpray: false
+ /test2 [l]
Stage: Stage-0
Fetch Operator
@@ -1846,39 +1879,41 @@ TOK_QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-2 is a root stage
+ Stage-1 depends on stages: Stage-2
Stage-0 depends on stages: Stage-1
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-2
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 3 (PARTITION-LEVEL SORT, 1)
#### A masked pattern was here ####
Vertices:
- Map 1
+ Map 2
Map Operator Tree:
TableScan
- alias: l
+ alias: r
Statistics: Num rows: 21 Data size: 4200 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: (key is not null and value is not null) (type: boolean)
Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: string), value (type: string)
- sort order: ++
- Map-reduce partition columns: key (type: string), value (type: string)
- Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE
- tag: 0
- auto parallelism: false
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0 {key} {value}
+ 1
+ keys:
+ 0 key (type: string), value (type: string)
+ 1 key (type: string), value (type: string)
+ Position of Big Table: 0
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
#### A masked pattern was here ####
Partition
- base file name: test2
+ base file name: test4
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
properties:
@@ -1890,9 +1925,9 @@ STAGE PLANS:
columns.comments
columns.types string:string
#### A masked pattern was here ####
- name default.test2
+ name default.test4
numFiles 3
- serialization.ddl struct test2 { string key, string value}
+ serialization.ddl struct test4 { string key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
totalSize 4200
@@ -1910,41 +1945,80 @@ STAGE PLANS:
columns.comments
columns.types string:string
#### A masked pattern was here ####
- name default.test2
+ name default.test4
numFiles 3
- serialization.ddl struct test2 { string key, string value}
+ serialization.ddl struct test4 { string key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
totalSize 4200
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.test2
- name: default.test2
+ name: default.test4
+ name: default.test4
Truncated Path -> Alias:
- /test2 [l]
- Map 3
+ /test4 [r]
+
+ Stage: Stage-1
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
Map Operator Tree:
TableScan
- alias: r
+ alias: l
Statistics: Num rows: 21 Data size: 4200 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: (key is not null and value is not null) (type: boolean)
Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: string), value (type: string)
- sort order: ++
- Map-reduce partition columns: key (type: string), value (type: string)
- Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE
- tag: 1
- auto parallelism: false
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {key} {value}
+ 1 {key} {value}
+ keys:
+ 0 key (type: string), value (type: string)
+ 1 key (type: string), value (type: string)
+ outputColumnNames: _col0, _col1, _col5, _col6
+ input vertices:
+ 1 Map 2
+ Position of Big Table: 0
+ Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string)
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0,_col1,_col2,_col3
+ columns.types string:string:string:string
+ escape.delim \
+ hive.serialization.extend.nesting.levels true
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
#### A masked pattern was here ####
Partition
- base file name: test4
+ base file name: test2
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
properties:
@@ -1956,9 +2030,9 @@ STAGE PLANS:
columns.comments
columns.types string:string
#### A masked pattern was here ####
- name default.test4
+ name default.test2
numFiles 3
- serialization.ddl struct test4 { string key, string value}
+ serialization.ddl struct test2 { string key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
totalSize 4200
@@ -1976,54 +2050,18 @@ STAGE PLANS:
columns.comments
columns.types string:string
#### A masked pattern was here ####
- name default.test4
+ name default.test2
numFiles 3
- serialization.ddl struct test4 { string key, string value}
+ serialization.ddl struct test2 { string key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
totalSize 4200
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.test4
- name: default.test4
+ name: default.test2
+ name: default.test2
Truncated Path -> Alias:
- /test4 [r]
- Reducer 2
- Needs Tagging: true
- Reduce Operator Tree:
- Join Operator
- condition map:
- Inner Join 0 to 1
- condition expressions:
- 0 {KEY.reducesinkkey0} {KEY.reducesinkkey1}
- 1 {KEY.reducesinkkey0} {KEY.reducesinkkey1}
- outputColumnNames: _col0, _col1, _col5, _col6
- Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE
- Select Operator
- expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string)
- outputColumnNames: _col0, _col1, _col2, _col3
- Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE
- File Output Operator
- compressed: false
- GlobalTableId: 0
-#### A masked pattern was here ####
- NumFilesPerFileSink: 1
- Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE
-#### A masked pattern was here ####
- table:
- input format: org.apache.hadoop.mapred.TextInputFormat
- output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
- properties:
- columns _col0,_col1,_col2,_col3
- columns.types string:string:string:string
- escape.delim \
- hive.serialization.extend.nesting.levels true
- serialization.format 1
- serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- TotalFiles: 1
- GatherStats: false
- MultiFileSpray: false
+ /test2 [l]
Stage: Stage-0
Fetch Operator
@@ -2082,53 +2120,55 @@ TOK_QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-2 is a root stage
+ Stage-1 depends on stages: Stage-2
Stage-0 depends on stages: Stage-1
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-2
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 3 (PARTITION-LEVEL SORT, 1)
#### A masked pattern was here ####
Vertices:
- Map 1
+ Map 2
Map Operator Tree:
TableScan
- alias: l
+ alias: r
Statistics: Num rows: 21 Data size: 4200 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: (key is not null and value is not null) (type: boolean)
Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: string), value (type: string)
- sort order: ++
- Map-reduce partition columns: key (type: string), value (type: string)
- Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE
- tag: 0
- auto parallelism: false
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0 {key} {value}
+ 1
+ keys:
+ 0 key (type: string), value (type: string)
+ 1 key (type: string), value (type: string)
+ Position of Big Table: 0
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
#### A masked pattern was here ####
Partition
- base file name: test3
+ base file name: test4
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
properties:
COLUMN_STATS_ACCURATE true
SORTBUCKETCOLSPREFIX TRUE
bucket_count 3
- bucket_field_name key
+ bucket_field_name value
columns key,value
columns.comments
columns.types string:string
#### A masked pattern was here ####
- name default.test3
+ name default.test4
numFiles 3
- serialization.ddl struct test3 { string key, string value}
+ serialization.ddl struct test4 { string key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
totalSize 4200
@@ -2141,60 +2181,99 @@ STAGE PLANS:
COLUMN_STATS_ACCURATE true
SORTBUCKETCOLSPREFIX TRUE
bucket_count 3
- bucket_field_name key
+ bucket_field_name value
columns key,value
columns.comments
columns.types string:string
#### A masked pattern was here ####
- name default.test3
+ name default.test4
numFiles 3
- serialization.ddl struct test3 { string key, string value}
+ serialization.ddl struct test4 { string key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
totalSize 4200
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.test3
- name: default.test3
+ name: default.test4
+ name: default.test4
Truncated Path -> Alias:
- /test3 [l]
- Map 3
+ /test4 [r]
+
+ Stage: Stage-1
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
Map Operator Tree:
TableScan
- alias: r
+ alias: l
Statistics: Num rows: 21 Data size: 4200 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: (key is not null and value is not null) (type: boolean)
Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: string), value (type: string)
- sort order: ++
- Map-reduce partition columns: key (type: string), value (type: string)
- Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE
- tag: 1
- auto parallelism: false
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {key} {value}
+ 1 {key} {value}
+ keys:
+ 0 key (type: string), value (type: string)
+ 1 key (type: string), value (type: string)
+ outputColumnNames: _col0, _col1, _col5, _col6
+ input vertices:
+ 1 Map 2
+ Position of Big Table: 0
+ Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string)
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0,_col1,_col2,_col3
+ columns.types string:string:string:string
+ escape.delim \
+ hive.serialization.extend.nesting.levels true
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
#### A masked pattern was here ####
Partition
- base file name: test4
+ base file name: test3
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
properties:
COLUMN_STATS_ACCURATE true
SORTBUCKETCOLSPREFIX TRUE
bucket_count 3
- bucket_field_name value
+ bucket_field_name key
columns key,value
columns.comments
columns.types string:string
#### A masked pattern was here ####
- name default.test4
+ name default.test3
numFiles 3
- serialization.ddl struct test4 { string key, string value}
+ serialization.ddl struct test3 { string key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
totalSize 4200
@@ -2207,59 +2286,23 @@ STAGE PLANS:
COLUMN_STATS_ACCURATE true
SORTBUCKETCOLSPREFIX TRUE
bucket_count 3
- bucket_field_name value
+ bucket_field_name key
columns key,value
columns.comments
columns.types string:string
#### A masked pattern was here ####
- name default.test4
+ name default.test3
numFiles 3
- serialization.ddl struct test4 { string key, string value}
+ serialization.ddl struct test3 { string key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
totalSize 4200
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.test4
- name: default.test4
+ name: default.test3
+ name: default.test3
Truncated Path -> Alias:
- /test4 [r]
- Reducer 2
- Needs Tagging: true
- Reduce Operator Tree:
- Join Operator
- condition map:
- Inner Join 0 to 1
- condition expressions:
- 0 {KEY.reducesinkkey0} {KEY.reducesinkkey1}
- 1 {KEY.reducesinkkey0} {KEY.reducesinkkey1}
- outputColumnNames: _col0, _col1, _col5, _col6
- Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE
- Select Operator
- expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string)
- outputColumnNames: _col0, _col1, _col2, _col3
- Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE
- File Output Operator
- compressed: false
- GlobalTableId: 0
-#### A masked pattern was here ####
- NumFilesPerFileSink: 1
- Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE
-#### A masked pattern was here ####
- table:
- input format: org.apache.hadoop.mapred.TextInputFormat
- output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
- properties:
- columns _col0,_col1,_col2,_col3
- columns.types string:string:string:string
- escape.delim \
- hive.serialization.extend.nesting.levels true
- serialization.format 1
- serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- TotalFiles: 1
- GatherStats: false
- MultiFileSpray: false
+ /test3 [l]
Stage: Stage-0
Fetch Operator
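The golden-file rewrites above, and those that follow, all show the same transformation: the shuffle join (a Reduce Output Operator on each input feeding a Join Operator in a reducer) becomes a two-stage map join. A new root stage scans the small input and ends in a Spark HashTable Sink Operator, which materializes the hash table; the dependent stage scans the big input and probes it in a Map Join Operator, with Local Work: Map Reduce Local Work loading the table in each task. A minimal sketch of a query that yields this shape, assuming the Spark engine and the standard src/src1 test tables (it is the same query the join25.q.out diff below covers):

  SET hive.execution.engine=spark;
  -- the MAPJOIN hint marks x as the side to hash
  EXPLAIN
  SELECT /*+ MAPJOIN(x) */ x.key, x.value, y.value
  FROM src1 x JOIN src y ON (x.key = y.key);

In each new plan the hash-table stage is the root (Stage-2 or Stage-3) and the probe stage (Stage-1) depends on it, which is exactly the STAGE DEPENDENCIES change in every hunk.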
diff --git ql/src/test/results/clientpositive/spark/join25.q.out ql/src/test/results/clientpositive/spark/join25.q.out
index 284c97d..84e496a 100644
--- ql/src/test/results/clientpositive/spark/join25.q.out
+++ ql/src/test/results/clientpositive/spark/join25.q.out
@@ -21,15 +21,14 @@ SELECT /*+ MAPJOIN(x) */ x.key, x.value, y.value
FROM src1 x JOIN src y ON (x.key = y.key)
POSTHOOK: type: QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-3 is a root stage
+ Stage-1 depends on stages: Stage-3
Stage-0 depends on stages: Stage-1
Stage-2 depends on stages: Stage-0
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-3
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 3 (PARTITION-LEVEL SORT, 1)
#### A masked pattern was here ####
Vertices:
Map 1
@@ -40,13 +39,21 @@ STAGE PLANS:
Filter Operator
predicate: key is not null (type: boolean)
Statistics: Num rows: 13 Data size: 99 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: string)
- sort order: +
- Map-reduce partition columns: key (type: string)
- Statistics: Num rows: 13 Data size: 99 Basic stats: COMPLETE Column stats: NONE
- value expressions: value (type: string)
- Map 3
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0 {value}
+ 1 {value}
+ keys:
+ 0 key (type: string)
+ 1 key (type: string)
+ Local Work:
+ Map Reduce Local Work
+
+ Stage: Stage-1
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 2
Map Operator Tree:
TableScan
alias: y
@@ -54,34 +61,33 @@ STAGE PLANS:
Filter Operator
predicate: key is not null (type: boolean)
Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: string)
- sort order: +
- Map-reduce partition columns: key (type: string)
- Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
- value expressions: value (type: string)
- Reducer 2
- Reduce Operator Tree:
- Join Operator
- condition map:
- Inner Join 0 to 1
- condition expressions:
- 0 {KEY.reducesinkkey0} {VALUE._col0}
- 1 {VALUE._col0}
- outputColumnNames: _col0, _col1, _col6
- Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE
- Select Operator
- expressions: UDFToInteger(_col0) (type: int), _col1 (type: string), _col6 (type: string)
- outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE
- File Output Operator
- compressed: false
- Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE
- table:
- input format: org.apache.hadoop.mapred.TextInputFormat
- output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
- serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.dest_j1
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {key} {value}
+ 1 {value}
+ keys:
+ 0 key (type: string)
+ 1 key (type: string)
+ outputColumnNames: _col0, _col1, _col6
+ input vertices:
+ 0 Map 1
+ Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: UDFToInteger(_col0) (type: int), _col1 (type: string), _col6 (type: string)
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.dest_j1
+ Local Work:
+ Map Reduce Local Work
Stage: Stage-0
Move Operator
diff --git ql/src/test/results/clientpositive/spark/join26.q.out ql/src/test/results/clientpositive/spark/join26.q.out
index e271184..8731942 100644
--- ql/src/test/results/clientpositive/spark/join26.q.out
+++ ql/src/test/results/clientpositive/spark/join26.q.out
@@ -102,15 +102,14 @@ TOK_QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-3 is a root stage
+ Stage-1 depends on stages: Stage-3
Stage-0 depends on stages: Stage-1
Stage-2 depends on stages: Stage-0
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-3
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 3 (PARTITION-LEVEL SORT, 1), Map 4 (PARTITION-LEVEL SORT, 1)
#### A masked pattern was here ####
Vertices:
Map 1
@@ -123,13 +122,18 @@ STAGE PLANS:
isSamplingPred: false
predicate: key is not null (type: boolean)
Statistics: Num rows: 13 Data size: 99 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: string)
- sort order: +
- Map-reduce partition columns: key (type: string)
- Statistics: Num rows: 13 Data size: 99 Basic stats: COMPLETE Column stats: NONE
- tag: 0
- auto parallelism: false
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0
+ 1 {value}
+ 2 {value}
+ keys:
+ 0 key (type: string)
+ 1 key (type: string)
+ 2 key (type: string)
+ Position of Big Table: 2
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
@@ -179,7 +183,7 @@ STAGE PLANS:
name: default.src1
Truncated Path -> Alias:
/src1 [x]
- Map 3
+ Map 2
Map Operator Tree:
TableScan
alias: y
@@ -189,14 +193,18 @@ STAGE PLANS:
isSamplingPred: false
predicate: key is not null (type: boolean)
Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: string)
- sort order: +
- Map-reduce partition columns: key (type: string)
- Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
- tag: 1
- value expressions: value (type: string)
- auto parallelism: false
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0 {key}
+ 1 {value}
+ 2 {value}
+ keys:
+ 0 key (type: string)
+ 1 key (type: string)
+ 2 key (type: string)
+ Position of Big Table: 2
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
@@ -246,7 +254,12 @@ STAGE PLANS:
name: default.src
Truncated Path -> Alias:
/src [y]
- Map 4
+
+ Stage: Stage-1
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 3
Map Operator Tree:
TableScan
alias: z
@@ -256,14 +269,56 @@ STAGE PLANS:
isSamplingPred: false
predicate: key is not null (type: boolean)
Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: string)
- sort order: +
- Map-reduce partition columns: key (type: string)
- Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
- tag: 2
- value expressions: value (type: string)
- auto parallelism: false
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ Inner Join 0 to 2
+ condition expressions:
+ 0 {key}
+ 1 {value}
+ 2 {value}
+ keys:
+ 0 key (type: string)
+ 1 key (type: string)
+ 2 key (type: string)
+ outputColumnNames: _col0, _col6, _col11
+ input vertices:
+ 0 Map 1
+ 1 Map 2
+ Position of Big Table: 2
+ Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: _col0 (type: string), _col11 (type: string), _col6 (type: string)
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ GlobalTableId: 1
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ bucket_count -1
+ columns key,value,val2
+ columns.comments
+ columns.types string:string:string
+#### A masked pattern was here ####
+ name default.dest_j1
+ serialization.ddl struct dest_j1 { string key, string value, string val2}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.dest_j1
+ TotalFiles: 1
+ GatherStats: true
+ MultiFileSpray: false
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
@@ -315,49 +370,6 @@ STAGE PLANS:
name: default.srcpart
Truncated Path -> Alias:
/srcpart/ds=2008-04-08/hr=11 [z]
- Reducer 2
- Needs Tagging: true
- Reduce Operator Tree:
- Join Operator
- condition map:
- Inner Join 0 to 1
- Inner Join 0 to 2
- condition expressions:
- 0 {KEY.reducesinkkey0}
- 1 {VALUE._col0}
- 2 {VALUE._col0}
- outputColumnNames: _col0, _col6, _col11
- Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
- Select Operator
- expressions: _col0 (type: string), _col11 (type: string), _col6 (type: string)
- outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
- File Output Operator
- compressed: false
- GlobalTableId: 1
-#### A masked pattern was here ####
- NumFilesPerFileSink: 1
- Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
-#### A masked pattern was here ####
- table:
- input format: org.apache.hadoop.mapred.TextInputFormat
- output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
- properties:
- bucket_count -1
- columns key,value,val2
- columns.comments
- columns.types string:string:string
-#### A masked pattern was here ####
- name default.dest_j1
- serialization.ddl struct dest_j1 { string key, string value, string val2}
- serialization.format 1
- serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
-#### A masked pattern was here ####
- serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.dest_j1
- TotalFiles: 1
- GatherStats: true
- MultiFileSpray: false
Stage: Stage-0
Move Operator
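join26 above is the n-way variant: both small inputs get their own Spark HashTable Sink Operator in the root stage, the probe vertex lists them under input vertices (0 Map 1, 1 Map 2), and Position of Big Table: 2 marks z as the streamed input. A hedged sketch of a query with that shape, reusing the test aliases (x over src1, y over src, z over a srcpart partition; the exact test query may differ in detail):

  SET hive.execution.engine=spark;
  EXPLAIN
  SELECT /*+ MAPJOIN(x, y) */ x.key, z.value, y.value
  FROM src1 x
  JOIN src y ON (x.key = y.key)
  JOIN srcpart z ON (x.key = z.key)
  WHERE z.ds = '2008-04-08' AND z.hr = 11;

join27 below is the two-way pattern again, just keyed on value instead of key.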
diff --git ql/src/test/results/clientpositive/spark/join27.q.out ql/src/test/results/clientpositive/spark/join27.q.out
index d31f29e..a578279 100644
--- ql/src/test/results/clientpositive/spark/join27.q.out
+++ ql/src/test/results/clientpositive/spark/join27.q.out
@@ -21,15 +21,14 @@ SELECT /*+ MAPJOIN(x) */ x.key, x.value, y.value
FROM src1 x JOIN src y ON (x.value = y.value)
POSTHOOK: type: QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-3 is a root stage
+ Stage-1 depends on stages: Stage-3
Stage-0 depends on stages: Stage-1
Stage-2 depends on stages: Stage-0
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-3
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 3 (PARTITION-LEVEL SORT, 1)
#### A masked pattern was here ####
Vertices:
Map 1
@@ -40,13 +39,21 @@ STAGE PLANS:
Filter Operator
predicate: value is not null (type: boolean)
Statistics: Num rows: 13 Data size: 99 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: value (type: string)
- sort order: +
- Map-reduce partition columns: value (type: string)
- Statistics: Num rows: 13 Data size: 99 Basic stats: COMPLETE Column stats: NONE
- value expressions: key (type: string)
- Map 3
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0 {key}
+ 1 {value}
+ keys:
+ 0 value (type: string)
+ 1 value (type: string)
+ Local Work:
+ Map Reduce Local Work
+
+ Stage: Stage-1
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 2
Map Operator Tree:
TableScan
alias: y
@@ -54,33 +61,33 @@ STAGE PLANS:
Filter Operator
predicate: value is not null (type: boolean)
Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: value (type: string)
- sort order: +
- Map-reduce partition columns: value (type: string)
- Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
- Reducer 2
- Reduce Operator Tree:
- Join Operator
- condition map:
- Inner Join 0 to 1
- condition expressions:
- 0 {VALUE._col0} {KEY.reducesinkkey0}
- 1 {KEY.reducesinkkey0}
- outputColumnNames: _col0, _col1, _col6
- Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE
- Select Operator
- expressions: UDFToInteger(_col0) (type: int), _col1 (type: string), _col6 (type: string)
- outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE
- File Output Operator
- compressed: false
- Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE
- table:
- input format: org.apache.hadoop.mapred.TextInputFormat
- output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
- serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.dest_j1
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {key} {value}
+ 1 {value}
+ keys:
+ 0 value (type: string)
+ 1 value (type: string)
+ outputColumnNames: _col0, _col1, _col6
+ input vertices:
+ 0 Map 1
+ Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: UDFToInteger(_col0) (type: int), _col1 (type: string), _col6 (type: string)
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.dest_j1
+ Local Work:
+ Map Reduce Local Work
Stage: Stage-0
Move Operator
diff --git ql/src/test/results/clientpositive/spark/join30.q.out ql/src/test/results/clientpositive/spark/join30.q.out
index 7fbbcfa..f6d2117 100644
--- ql/src/test/results/clientpositive/spark/join30.q.out
+++ ql/src/test/results/clientpositive/spark/join30.q.out
@@ -19,16 +19,14 @@ INSERT OVERWRITE TABLE dest_j1
SELECT /*+ MAPJOIN(x) */ x.key, count(1) FROM src1 x JOIN src y ON (x.key = y.key) group by x.key
POSTHOOK: type: QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-3 is a root stage
+ Stage-1 depends on stages: Stage-3
Stage-0 depends on stages: Stage-1
Stage-2 depends on stages: Stage-0
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-3
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 4 (PARTITION-LEVEL SORT, 1)
- Reducer 3 <- Reducer 2 (GROUP, 1)
#### A masked pattern was here ####
Vertices:
Map 1
@@ -39,12 +37,23 @@ STAGE PLANS:
Filter Operator
predicate: key is not null (type: boolean)
Statistics: Num rows: 13 Data size: 99 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: string)
- sort order: +
- Map-reduce partition columns: key (type: string)
- Statistics: Num rows: 13 Data size: 99 Basic stats: COMPLETE Column stats: NONE
- Map 4
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0
+ 1
+ keys:
+ 0 key (type: string)
+ 1 key (type: string)
+ Local Work:
+ Map Reduce Local Work
+
+ Stage: Stage-1
+ Spark
+ Edges:
+ Reducer 3 <- Map 2 (GROUP, 1)
+#### A masked pattern was here ####
+ Vertices:
+ Map 2
Map Operator Tree:
TableScan
alias: y
@@ -52,33 +61,33 @@ STAGE PLANS:
Filter Operator
predicate: key is not null (type: boolean)
Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: string)
- sort order: +
- Map-reduce partition columns: key (type: string)
- Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
- Reducer 2
- Reduce Operator Tree:
- Join Operator
- condition map:
- Inner Join 0 to 1
- condition expressions:
- 0 {KEY.reducesinkkey0}
- 1
- outputColumnNames: _col0
- Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE
- Group By Operator
- aggregations: count(1)
- keys: _col0 (type: string)
- mode: hash
- outputColumnNames: _col0, _col1
- Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: _col0 (type: string)
- sort order: +
- Map-reduce partition columns: _col0 (type: string)
- Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE
- value expressions: _col1 (type: bigint)
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {key}
+ 1
+ keys:
+ 0 key (type: string)
+ 1 key (type: string)
+ outputColumnNames: _col0
+ input vertices:
+ 0 Map 1
+ Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE
+ Group By Operator
+ aggregations: count(1)
+ keys: _col0 (type: string)
+ mode: hash
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ key expressions: _col0 (type: string)
+ sort order: +
+ Map-reduce partition columns: _col0 (type: string)
+ Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE
+ value expressions: _col1 (type: bigint)
+ Local Work:
+ Map Reduce Local Work
Reducer 3
Reduce Operator Tree:
Group By Operator
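join30 above is the mixed case: the join becomes a map join, but the aggregation still needs a shuffle, so Stage-1 keeps an edge (Reducer 3 <- Map 2 (GROUP, 1)) and the Map Join Operator feeds a hash-mode Group By Operator and a Reduce Output Operator. Only the join shuffle disappears; the group-by shuffle is untouched. The test query, quoted from the diff header:

  SET hive.execution.engine=spark;
  EXPLAIN
  SELECT /*+ MAPJOIN(x) */ x.key, count(1)
  FROM src1 x JOIN src y ON (x.key = y.key)
  GROUP BY x.key;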
diff --git ql/src/test/results/clientpositive/spark/join36.q.out ql/src/test/results/clientpositive/spark/join36.q.out
index f1317ea..4850951 100644
--- ql/src/test/results/clientpositive/spark/join36.q.out
+++ ql/src/test/results/clientpositive/spark/join36.q.out
@@ -61,15 +61,14 @@ SELECT /*+ MAPJOIN(x) */ x.key, x.cnt, y.cnt
FROM tmp1 x JOIN tmp2 y ON (x.key = y.key)
POSTHOOK: type: QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-3 is a root stage
+ Stage-1 depends on stages: Stage-3
Stage-0 depends on stages: Stage-1
Stage-2 depends on stages: Stage-0
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-3
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 3 (PARTITION-LEVEL SORT, 3)
#### A masked pattern was here ####
Vertices:
Map 1
@@ -80,13 +79,21 @@ STAGE PLANS:
Filter Operator
predicate: key is not null (type: boolean)
Statistics: Num rows: 155 Data size: 743 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int)
- sort order: +
- Map-reduce partition columns: key (type: int)
- Statistics: Num rows: 155 Data size: 743 Basic stats: COMPLETE Column stats: NONE
- value expressions: cnt (type: int)
- Map 3
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0 {cnt}
+ 1 {cnt}
+ keys:
+ 0 key (type: int)
+ 1 key (type: int)
+ Local Work:
+ Map Reduce Local Work
+
+ Stage: Stage-1
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 2
Map Operator Tree:
TableScan
alias: y
@@ -94,34 +101,33 @@ STAGE PLANS:
Filter Operator
predicate: key is not null (type: boolean)
Statistics: Num rows: 155 Data size: 743 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int)
- sort order: +
- Map-reduce partition columns: key (type: int)
- Statistics: Num rows: 155 Data size: 743 Basic stats: COMPLETE Column stats: NONE
- value expressions: cnt (type: int)
- Reducer 2
- Reduce Operator Tree:
- Join Operator
- condition map:
- Inner Join 0 to 1
- condition expressions:
- 0 {KEY.reducesinkkey0} {VALUE._col0}
- 1 {VALUE._col0}
- outputColumnNames: _col0, _col1, _col6
- Statistics: Num rows: 170 Data size: 817 Basic stats: COMPLETE Column stats: NONE
- Select Operator
- expressions: _col0 (type: int), _col1 (type: int), _col6 (type: int)
- outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 170 Data size: 817 Basic stats: COMPLETE Column stats: NONE
- File Output Operator
- compressed: false
- Statistics: Num rows: 170 Data size: 817 Basic stats: COMPLETE Column stats: NONE
- table:
- input format: org.apache.hadoop.mapred.TextInputFormat
- output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
- serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.dest_j1
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {key} {cnt}
+ 1 {cnt}
+ keys:
+ 0 key (type: int)
+ 1 key (type: int)
+ outputColumnNames: _col0, _col1, _col6
+ input vertices:
+ 0 Map 1
+ Statistics: Num rows: 170 Data size: 817 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: _col0 (type: int), _col1 (type: int), _col6 (type: int)
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 170 Data size: 817 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 170 Data size: 817 Basic stats: COMPLETE Column stats: NONE
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.dest_j1
+ Local Work:
+ Map Reduce Local Work
Stage: Stage-0
Move Operator
diff --git ql/src/test/results/clientpositive/spark/join37.q.out ql/src/test/results/clientpositive/spark/join37.q.out
index 448e983..bdb9aa6 100644
--- ql/src/test/results/clientpositive/spark/join37.q.out
+++ ql/src/test/results/clientpositive/spark/join37.q.out
@@ -21,15 +21,14 @@ SELECT /*+ MAPJOIN(X) */ x.key, x.value, y.value
FROM src1 x JOIN src y ON (x.key = y.key)
POSTHOOK: type: QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-3 is a root stage
+ Stage-1 depends on stages: Stage-3
Stage-0 depends on stages: Stage-1
Stage-2 depends on stages: Stage-0
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-3
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 3 (PARTITION-LEVEL SORT, 1)
#### A masked pattern was here ####
Vertices:
Map 1
@@ -40,13 +39,21 @@ STAGE PLANS:
Filter Operator
predicate: key is not null (type: boolean)
Statistics: Num rows: 13 Data size: 99 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: string)
- sort order: +
- Map-reduce partition columns: key (type: string)
- Statistics: Num rows: 13 Data size: 99 Basic stats: COMPLETE Column stats: NONE
- value expressions: value (type: string)
- Map 3
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0 {value}
+ 1 {value}
+ keys:
+ 0 key (type: string)
+ 1 key (type: string)
+ Local Work:
+ Map Reduce Local Work
+
+ Stage: Stage-1
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 2
Map Operator Tree:
TableScan
alias: y
@@ -54,34 +61,33 @@ STAGE PLANS:
Filter Operator
predicate: key is not null (type: boolean)
Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: string)
- sort order: +
- Map-reduce partition columns: key (type: string)
- Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
- value expressions: value (type: string)
- Reducer 2
- Reduce Operator Tree:
- Join Operator
- condition map:
- Inner Join 0 to 1
- condition expressions:
- 0 {KEY.reducesinkkey0} {VALUE._col0}
- 1 {VALUE._col0}
- outputColumnNames: _col0, _col1, _col6
- Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE
- Select Operator
- expressions: UDFToInteger(_col0) (type: int), _col1 (type: string), _col6 (type: string)
- outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE
- File Output Operator
- compressed: false
- Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE
- table:
- input format: org.apache.hadoop.mapred.TextInputFormat
- output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
- serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.dest_j1
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {key} {value}
+ 1 {value}
+ keys:
+ 0 key (type: string)
+ 1 key (type: string)
+ outputColumnNames: _col0, _col1, _col6
+ input vertices:
+ 0 Map 1
+ Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: UDFToInteger(_col0) (type: int), _col1 (type: string), _col6 (type: string)
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.dest_j1
+ Local Work:
+ Map Reduce Local Work
Stage: Stage-0
Move Operator
diff --git ql/src/test/results/clientpositive/spark/join38.q.out ql/src/test/results/clientpositive/spark/join38.q.out
index 735d7ea..330800e 100644
--- ql/src/test/results/clientpositive/spark/join38.q.out
+++ ql/src/test/results/clientpositive/spark/join38.q.out
@@ -58,14 +58,14 @@ STAGE PLANS:
Spark
#### A masked pattern was here ####
Vertices:
- Map 3
+ Map 1
Map Operator Tree:
TableScan
- alias: b
- Statistics: Num rows: 2 Data size: 126 Basic stats: COMPLETE Column stats: NONE
+ alias: a
+ Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
Filter Operator
- predicate: (col11 = 111) (type: boolean)
- Statistics: Num rows: 1 Data size: 63 Basic stats: COMPLETE Column stats: NONE
+ predicate: (key = 111) (type: boolean)
+ Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
Spark HashTable Sink Operator
condition expressions:
0 {value}
@@ -79,17 +79,17 @@ STAGE PLANS:
Stage: Stage-1
Spark
Edges:
- Reducer 2 <- Map 1 (GROUP, 3)
+ Reducer 3 <- Map 2 (GROUP, 3)
#### A masked pattern was here ####
Vertices:
- Map 1
+ Map 2
Map Operator Tree:
TableScan
- alias: a
- Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+ alias: b
+ Statistics: Num rows: 2 Data size: 126 Basic stats: COMPLETE Column stats: NONE
Filter Operator
- predicate: (key = 111) (type: boolean)
- Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
+ predicate: (col11 = 111) (type: boolean)
+ Statistics: Num rows: 1 Data size: 63 Basic stats: COMPLETE Column stats: NONE
Map Join Operator
condition map:
Inner Join 0 to 1
@@ -101,7 +101,7 @@ STAGE PLANS:
1 '111' (type: string)
outputColumnNames: _col1, _col10
input vertices:
- 1 Map 3
+ 0 Map 1
Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE
Group By Operator
aggregations: count(1)
@@ -117,7 +117,7 @@ STAGE PLANS:
value expressions: _col2 (type: bigint)
Local Work:
Map Reduce Local Work
- Reducer 2
+ Reducer 3
Reduce Operator Tree:
Group By Operator
aggregations: count(VALUE._col0)
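join38 above is the odd one out: it was already a map join, and the diff only swaps which input is hashed. Before, the two-row table b fed the hash-table stage; now the filtered scan of a does (input vertices changes from 1 Map 3 to 0 Map 1), so b becomes the streamed big table. The plan shape is unchanged; the vertices are renumbered to match the new big-table choice.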
diff --git ql/src/test/results/clientpositive/spark/join39.q.out ql/src/test/results/clientpositive/spark/join39.q.out
index 0734d4b..416c92e 100644
--- ql/src/test/results/clientpositive/spark/join39.q.out
+++ ql/src/test/results/clientpositive/spark/join39.q.out
@@ -21,15 +21,14 @@ SELECT /*+ MAPJOIN(y) */ x.key, x.value, y.key, y.value
FROM src x left outer JOIN (select * from src where key <= 100) y ON (x.key = y.key)
POSTHOOK: type: QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-3 is a root stage
+ Stage-1 depends on stages: Stage-3
Stage-0 depends on stages: Stage-1
Stage-2 depends on stages: Stage-0
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-3
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 3 (PARTITION-LEVEL SORT, 1)
#### A masked pattern was here ####
Vertices:
Map 1
@@ -44,45 +43,52 @@ STAGE PLANS:
expressions: key (type: string), value (type: string)
outputColumnNames: _col0, _col1
Statistics: Num rows: 166 Data size: 1763 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: _col0 (type: string)
- sort order: +
- Map-reduce partition columns: _col0 (type: string)
- Statistics: Num rows: 166 Data size: 1763 Basic stats: COMPLETE Column stats: NONE
- value expressions: _col1 (type: string)
- Map 3
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0 {key} {value}
+ 1 {_col1}
+ keys:
+ 0 key (type: string)
+ 1 _col0 (type: string)
+ Local Work:
+ Map Reduce Local Work
+
+ Stage: Stage-1
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 2
Map Operator Tree:
TableScan
alias: x
Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: string)
- sort order: +
- Map-reduce partition columns: key (type: string)
- Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
- value expressions: value (type: string)
- Reducer 2
- Reduce Operator Tree:
- Join Operator
- condition map:
- Left Outer Join0 to 1
- condition expressions:
- 0 {KEY.reducesinkkey0} {VALUE._col0}
- 1 {KEY.reducesinkkey0} {VALUE._col0}
- outputColumnNames: _col0, _col1, _col5, _col6
- Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
- Select Operator
- expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string)
- outputColumnNames: _col0, _col1, _col2, _col3
- Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
- File Output Operator
- compressed: false
+ Map Join Operator
+ condition map:
+ Left Outer Join0 to 1
+ condition expressions:
+ 0 {key} {value}
+ 1 {_col0} {_col1}
+ keys:
+ 0 key (type: string)
+ 1 _col0 (type: string)
+ outputColumnNames: _col0, _col1, _col5, _col6
+ input vertices:
+ 1 Map 1
Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
- table:
- input format: org.apache.hadoop.mapred.TextInputFormat
- output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
- serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.dest_j1
+ Select Operator
+ expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string)
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.dest_j1
+ Local Work:
+ Map Reduce Local Work
Stage: Stage-0
Move Operator
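join39 above, and the matching join40 hunk below, show the outer-join case: with a LEFT OUTER JOIN only the non-preserved right side can serve as the hashed input, so the filtered subquery y lands in the hash-table stage and its projected columns appear as {_col0} {_col1}, keyed on _col0 rather than a base column. The query, quoted from the diff header:

  SET hive.execution.engine=spark;
  EXPLAIN
  SELECT /*+ MAPJOIN(y) */ x.key, x.value, y.key, y.value
  FROM src x LEFT OUTER JOIN
    (SELECT * FROM src WHERE key <= 100) y ON (x.key = y.key);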
diff --git ql/src/test/results/clientpositive/spark/join40.q.out ql/src/test/results/clientpositive/spark/join40.q.out
index 60ef13d..6e990c6 100644
--- ql/src/test/results/clientpositive/spark/join40.q.out
+++ ql/src/test/results/clientpositive/spark/join40.q.out
@@ -3093,14 +3093,13 @@ SELECT /*+ MAPJOIN(y) */ x.key, x.value, y.key, y.value
FROM src x left outer JOIN (select * from src where key <= 100) y ON (x.key = y.key)
POSTHOOK: type: QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-2 is a root stage
+ Stage-1 depends on stages: Stage-2
Stage-0 depends on stages: Stage-1
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-2
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 3 (PARTITION-LEVEL SORT, 3)
#### A masked pattern was here ####
Vertices:
Map 1
@@ -3115,44 +3114,51 @@ STAGE PLANS:
expressions: key (type: string), value (type: string)
outputColumnNames: _col0, _col1
Statistics: Num rows: 166 Data size: 1763 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: _col0 (type: string)
- sort order: +
- Map-reduce partition columns: _col0 (type: string)
- Statistics: Num rows: 166 Data size: 1763 Basic stats: COMPLETE Column stats: NONE
- value expressions: _col1 (type: string)
- Map 3
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0 {key} {value}
+ 1 {_col1}
+ keys:
+ 0 key (type: string)
+ 1 _col0 (type: string)
+ Local Work:
+ Map Reduce Local Work
+
+ Stage: Stage-1
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 2
Map Operator Tree:
TableScan
alias: x
Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: string)
- sort order: +
- Map-reduce partition columns: key (type: string)
- Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
- value expressions: value (type: string)
- Reducer 2
- Reduce Operator Tree:
- Join Operator
- condition map:
- Left Outer Join0 to 1
- condition expressions:
- 0 {KEY.reducesinkkey0} {VALUE._col0}
- 1 {KEY.reducesinkkey0} {VALUE._col0}
- outputColumnNames: _col0, _col1, _col5, _col6
- Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
- Select Operator
- expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string)
- outputColumnNames: _col0, _col1, _col2, _col3
- Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
- File Output Operator
- compressed: false
+ Map Join Operator
+ condition map:
+ Left Outer Join0 to 1
+ condition expressions:
+ 0 {key} {value}
+ 1 {_col0} {_col1}
+ keys:
+ 0 key (type: string)
+ 1 _col0 (type: string)
+ outputColumnNames: _col0, _col1, _col5, _col6
+ input vertices:
+ 1 Map 1
Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
- table:
- input format: org.apache.hadoop.mapred.TextInputFormat
- output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
- serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ Select Operator
+ expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string)
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ Local Work:
+ Map Reduce Local Work
Stage: Stage-0
Fetch Operator
diff --git ql/src/test/results/clientpositive/spark/join_map_ppr.q.out ql/src/test/results/clientpositive/spark/join_map_ppr.q.out
index 59fdb99..91974f1 100644
--- ql/src/test/results/clientpositive/spark/join_map_ppr.q.out
+++ ql/src/test/results/clientpositive/spark/join_map_ppr.q.out
@@ -104,15 +104,14 @@ TOK_QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-3 is a root stage
+ Stage-1 depends on stages: Stage-3
Stage-0 depends on stages: Stage-1
Stage-2 depends on stages: Stage-0
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-3
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 3 (PARTITION-LEVEL SORT, 1), Map 4 (PARTITION-LEVEL SORT, 1)
#### A masked pattern was here ####
Vertices:
Map 1
@@ -125,13 +124,18 @@ STAGE PLANS:
isSamplingPred: false
predicate: key is not null (type: boolean)
Statistics: Num rows: 13 Data size: 99 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: string)
- sort order: +
- Map-reduce partition columns: key (type: string)
- Statistics: Num rows: 13 Data size: 99 Basic stats: COMPLETE Column stats: NONE
- tag: 0
- auto parallelism: false
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0
+ 1 {value}
+ 2 {value}
+ keys:
+ 0 key (type: string)
+ 1 key (type: string)
+ 2 key (type: string)
+ Position of Big Table: 2
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
@@ -181,7 +185,7 @@ STAGE PLANS:
name: default.src1
Truncated Path -> Alias:
/src1 [x]
- Map 3
+ Map 2
Map Operator Tree:
TableScan
alias: y
@@ -191,14 +195,18 @@ STAGE PLANS:
isSamplingPred: false
predicate: key is not null (type: boolean)
Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: string)
- sort order: +
- Map-reduce partition columns: key (type: string)
- Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
- tag: 1
- value expressions: value (type: string)
- auto parallelism: false
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0 {key}
+ 1 {value}
+ 2 {value}
+ keys:
+ 0 key (type: string)
+ 1 key (type: string)
+ 2 key (type: string)
+ Position of Big Table: 2
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
@@ -248,7 +256,12 @@ STAGE PLANS:
name: default.src
Truncated Path -> Alias:
/src [y]
- Map 4
+
+ Stage: Stage-1
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 3
Map Operator Tree:
TableScan
alias: z
@@ -258,14 +271,56 @@ STAGE PLANS:
isSamplingPred: false
predicate: key is not null (type: boolean)
Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: string)
- sort order: +
- Map-reduce partition columns: key (type: string)
- Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
- tag: 2
- value expressions: value (type: string)
- auto parallelism: false
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ Inner Join 0 to 2
+ condition expressions:
+ 0 {key}
+ 1 {value}
+ 2 {value}
+ keys:
+ 0 key (type: string)
+ 1 key (type: string)
+ 2 key (type: string)
+ outputColumnNames: _col0, _col6, _col11
+ input vertices:
+ 0 Map 1
+ 1 Map 2
+ Position of Big Table: 2
+ Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: _col0 (type: string), _col11 (type: string), _col6 (type: string)
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ GlobalTableId: 1
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ bucket_count -1
+ columns key,value,val2
+ columns.comments
+ columns.types string:string:string
+#### A masked pattern was here ####
+ name default.dest_j1
+ serialization.ddl struct dest_j1 { string key, string value, string val2}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.dest_j1
+ TotalFiles: 1
+ GatherStats: true
+ MultiFileSpray: false
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
@@ -317,49 +372,6 @@ STAGE PLANS:
name: default.srcpart
Truncated Path -> Alias:
/srcpart/ds=2008-04-08/hr=11 [z]
- Reducer 2
- Needs Tagging: true
- Reduce Operator Tree:
- Join Operator
- condition map:
- Inner Join 0 to 1
- Inner Join 0 to 2
- condition expressions:
- 0 {KEY.reducesinkkey0}
- 1 {VALUE._col0}
- 2 {VALUE._col0}
- outputColumnNames: _col0, _col6, _col11
- Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
- Select Operator
- expressions: _col0 (type: string), _col11 (type: string), _col6 (type: string)
- outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
- File Output Operator
- compressed: false
- GlobalTableId: 1
-#### A masked pattern was here ####
- NumFilesPerFileSink: 1
- Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
-#### A masked pattern was here ####
- table:
- input format: org.apache.hadoop.mapred.TextInputFormat
- output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
- properties:
- bucket_count -1
- columns key,value,val2
- columns.comments
- columns.types string:string:string
-#### A masked pattern was here ####
- name default.dest_j1
- serialization.ddl struct dest_j1 { string key, string value, string val2}
- serialization.format 1
- serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
-#### A masked pattern was here ####
- serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.dest_j1
- TotalFiles: 1
- GatherStats: true
- MultiFileSpray: false
Stage: Stage-0
Move Operator
@@ -657,15 +669,14 @@ TOK_QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-3 is a root stage
+ Stage-1 depends on stages: Stage-3
Stage-0 depends on stages: Stage-1
Stage-2 depends on stages: Stage-0
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-3
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 3 (PARTITION-LEVEL SORT, 3), Map 4 (PARTITION-LEVEL SORT, 3)
#### A masked pattern was here ####
Vertices:
Map 1
@@ -678,14 +689,18 @@ STAGE PLANS:
isSamplingPred: false
predicate: UDFToDouble(key) is not null (type: boolean)
Statistics: Num rows: 13 Data size: 99 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: UDFToDouble(key) (type: double)
- sort order: +
- Map-reduce partition columns: UDFToDouble(key) (type: double)
- Statistics: Num rows: 13 Data size: 99 Basic stats: COMPLETE Column stats: NONE
- tag: 0
- value expressions: key (type: string)
- auto parallelism: false
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0 {key}
+ 1 {value}
+ 2 {value}
+ keys:
+ 0 UDFToDouble(key) (type: double)
+ 1 UDFToDouble(key) (type: double)
+ 2 UDFToDouble(key) (type: double)
+ Position of Big Table: 2
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
@@ -735,7 +750,7 @@ STAGE PLANS:
name: default.src1_copy
Truncated Path -> Alias:
/src1_copy [x]
- Map 3
+ Map 2
Map Operator Tree:
TableScan
alias: y
@@ -745,14 +760,18 @@ STAGE PLANS:
isSamplingPred: false
predicate: UDFToDouble(key) is not null (type: boolean)
Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: UDFToDouble(key) (type: double)
- sort order: +
- Map-reduce partition columns: UDFToDouble(key) (type: double)
- Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
- tag: 1
- value expressions: value (type: string)
- auto parallelism: false
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0 {key}
+ 1 {value}
+ 2 {value}
+ keys:
+ 0 UDFToDouble(key) (type: double)
+ 1 UDFToDouble(key) (type: double)
+ 2 UDFToDouble(key) (type: double)
+ Position of Big Table: 2
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
@@ -802,7 +821,12 @@ STAGE PLANS:
name: default.src_copy
Truncated Path -> Alias:
/src_copy [y]
- Map 4
+
+ Stage: Stage-1
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 3
Map Operator Tree:
TableScan
alias: z
@@ -812,14 +836,61 @@ STAGE PLANS:
isSamplingPred: false
predicate: UDFToDouble(key) is not null (type: boolean)
Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: UDFToDouble(key) (type: double)
- sort order: +
- Map-reduce partition columns: UDFToDouble(key) (type: double)
- Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
- tag: 2
- value expressions: value (type: string)
- auto parallelism: false
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ Inner Join 0 to 2
+ condition expressions:
+ 0 {key}
+ 1 {value}
+ 2 {value}
+ keys:
+ 0 UDFToDouble(key) (type: double)
+ 1 UDFToDouble(key) (type: double)
+ 2 UDFToDouble(key) (type: double)
+ outputColumnNames: _col0, _col6, _col11
+ input vertices:
+ 0 Map 1
+ 1 Map 2
+ Position of Big Table: 2
+ Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: _col0 (type: string), _col11 (type: string), _col6 (type: string)
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ GlobalTableId: 1
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ COLUMN_STATS_ACCURATE true
+ bucket_count -1
+ columns key,value,val2
+ columns.comments
+ columns.types string:string:string
+#### A masked pattern was here ####
+ name default.dest_j1
+ numFiles 1
+ numRows 107
+ rawDataSize 2018
+ serialization.ddl struct dest_j1 { string key, string value, string val2}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 2125
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.dest_j1
+ TotalFiles: 1
+ GatherStats: true
+ MultiFileSpray: false
+ Local Work:
+ Map Reduce Local Work
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
@@ -871,54 +942,6 @@ STAGE PLANS:
name: default.srcpart
Truncated Path -> Alias:
/srcpart/ds=2008-04-08/hr=11 [z]
- Reducer 2
- Needs Tagging: true
- Reduce Operator Tree:
- Join Operator
- condition map:
- Inner Join 0 to 1
- Inner Join 0 to 2
- condition expressions:
- 0 {VALUE._col0}
- 1 {VALUE._col1}
- 2 {VALUE._col1}
- outputColumnNames: _col0, _col6, _col11
- Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
- Select Operator
- expressions: _col0 (type: string), _col11 (type: string), _col6 (type: string)
- outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
- File Output Operator
- compressed: false
- GlobalTableId: 1
-#### A masked pattern was here ####
- NumFilesPerFileSink: 1
- Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
-#### A masked pattern was here ####
- table:
- input format: org.apache.hadoop.mapred.TextInputFormat
- output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
- properties:
- COLUMN_STATS_ACCURATE true
- bucket_count -1
- columns key,value,val2
- columns.comments
- columns.types string:string:string
-#### A masked pattern was here ####
- name default.dest_j1
- numFiles 1
- numRows 107
- rawDataSize 2018
- serialization.ddl struct dest_j1 { string key, string value, string val2}
- serialization.format 1
- serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- totalSize 2125
-#### A masked pattern was here ####
- serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.dest_j1
- TotalFiles: 1
- GatherStats: true
- MultiFileSpray: false
Stage: Stage-0
Move Operator
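
For orientation before the next file: the hunks above replace a shuffle join (three Reduce Output Operators feeding one Reducer) with a two-stage map join, where Stage-2 loads the two small inputs into Spark HashTable Sink Operators and Stage-1 streams the big table (Position of Big Table: 2) through a Map Join Operator. A minimal sketch of the query shape that yields such a plan, assuming the src_copy, srcpart, and dest_j1 tables and the x/y/z aliases visible in the plan; the real .q file is not part of this diff and may differ:

  -- sketch only: reconstructed from the plan above, not the actual test query
  SET hive.execution.engine=spark;
  EXPLAIN EXTENDED
  INSERT OVERWRITE TABLE dest_j1
  SELECT /*+ MAPJOIN(x, y) */ x.key, z.value, y.value
  FROM src_copy x
  JOIN src_copy y ON (x.key = y.key)
  JOIN srcpart z ON (x.key = z.key);

The EXTENDED form accounts for the table properties and GatherStats fields printed above; the essential change is that once both small sides fit in memory, the join needs no shuffle edge at all.
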
diff --git ql/src/test/results/clientpositive/spark/mapjoin1.q.out ql/src/test/results/clientpositive/spark/mapjoin1.q.out
index 80e38b9..0e5907c 100644
--- ql/src/test/results/clientpositive/spark/mapjoin1.q.out
+++ ql/src/test/results/clientpositive/spark/mapjoin1.q.out
@@ -30,14 +30,13 @@ EXPLAIN
SELECT /*+ MAPJOIN(a) */ * FROM src a RIGHT OUTER JOIN src b on a.key=b.key AND true limit 10
POSTHOOK: type: QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-2 is a root stage
+ Stage-1 depends on stages: Stage-2
Stage-0 depends on stages: Stage-1
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-2
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 3 (PARTITION-LEVEL SORT, 3)
#### A masked pattern was here ####
Vertices:
Map 1
@@ -45,50 +44,60 @@ STAGE PLANS:
TableScan
alias: a
Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: string)
- sort order: +
- Map-reduce partition columns: key (type: string)
- Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
- value expressions: value (type: string)
- Map 3
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0 {value}
+ 1 {key} {value}
+ filter predicates:
+ 0
+ 1 {true}
+ keys:
+ 0 key (type: string)
+ 1 key (type: string)
+ Local Work:
+ Map Reduce Local Work
+
+ Stage: Stage-1
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 2
Map Operator Tree:
TableScan
alias: b
Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: string)
- sort order: +
- Map-reduce partition columns: key (type: string)
- Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
- value expressions: value (type: string)
- Reducer 2
- Reduce Operator Tree:
- Join Operator
- condition map:
- Right Outer Join0 to 1
- condition expressions:
- 0 {KEY.reducesinkkey0} {VALUE._col0}
- 1 {KEY.reducesinkkey0} {VALUE._col0}
- filter predicates:
- 0
- 1 {true}
- outputColumnNames: _col0, _col1, _col5, _col6
- Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
- Select Operator
- expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string)
- outputColumnNames: _col0, _col1, _col2, _col3
- Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
- Limit
- Number of rows: 10
- Statistics: Num rows: 10 Data size: 100 Basic stats: COMPLETE Column stats: NONE
- File Output Operator
- compressed: false
- Statistics: Num rows: 10 Data size: 100 Basic stats: COMPLETE Column stats: NONE
- table:
- input format: org.apache.hadoop.mapred.TextInputFormat
- output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
- serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ Map Join Operator
+ condition map:
+ Right Outer Join0 to 1
+ condition expressions:
+ 0 {key} {value}
+ 1 {key} {value}
+ filter predicates:
+ 0
+ 1 {true}
+ keys:
+ 0 key (type: string)
+ 1 key (type: string)
+ outputColumnNames: _col0, _col1, _col5, _col6
+ input vertices:
+ 0 Map 1
+ Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string)
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
+ Limit
+ Number of rows: 10
+ Statistics: Num rows: 10 Data size: 100 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 10 Data size: 100 Basic stats: COMPLETE Column stats: NONE
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ Local Work:
+ Map Reduce Local Work
Stage: Stage-0
Fetch Operator
@@ -104,16 +113,16 @@ POSTHOOK: query: SELECT /*+ MAPJOIN(a) */ * FROM src a RIGHT OUTER JOIN src b on
POSTHOOK: type: QUERY
POSTHOOK: Input: default@src
#### A masked pattern was here ####
-0 val_0 0 val_0
-0 val_0 0 val_0
-0 val_0 0 val_0
-0 val_0 0 val_0
-0 val_0 0 val_0
-0 val_0 0 val_0
-0 val_0 0 val_0
-0 val_0 0 val_0
-0 val_0 0 val_0
-105 val_105 105 val_105
+238 val_238 238 val_238
+238 val_238 238 val_238
+86 val_86 86 val_86
+311 val_311 311 val_311
+311 val_311 311 val_311
+311 val_311 311 val_311
+27 val_27 27 val_27
+165 val_165 165 val_165
+165 val_165 165 val_165
+409 val_409 409 val_409
PREHOOK: query: -- func filter on outer join
EXPLAIN
SELECT /*+ MAPJOIN(a) */ * FROM src a RIGHT OUTER JOIN src b on a.key=b.key AND b.key * 10 < '1000' limit 10
@@ -123,14 +132,13 @@ EXPLAIN
SELECT /*+ MAPJOIN(a) */ * FROM src a RIGHT OUTER JOIN src b on a.key=b.key AND b.key * 10 < '1000' limit 10
POSTHOOK: type: QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-2 is a root stage
+ Stage-1 depends on stages: Stage-2
Stage-0 depends on stages: Stage-1
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-2
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 3 (PARTITION-LEVEL SORT, 3)
#### A masked pattern was here ####
Vertices:
Map 1
@@ -138,50 +146,60 @@ STAGE PLANS:
TableScan
alias: a
Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: string)
- sort order: +
- Map-reduce partition columns: key (type: string)
- Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
- value expressions: value (type: string)
- Map 3
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0 {value}
+ 1 {key} {value}
+ filter predicates:
+ 0
+ 1 {((key * 10) < '1000')}
+ keys:
+ 0 key (type: string)
+ 1 key (type: string)
+ Local Work:
+ Map Reduce Local Work
+
+ Stage: Stage-1
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 2
Map Operator Tree:
TableScan
alias: b
Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: string)
- sort order: +
- Map-reduce partition columns: key (type: string)
- Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
- value expressions: value (type: string)
- Reducer 2
- Reduce Operator Tree:
- Join Operator
- condition map:
- Right Outer Join0 to 1
- condition expressions:
- 0 {KEY.reducesinkkey0} {VALUE._col0}
- 1 {KEY.reducesinkkey0} {VALUE._col0}
- filter predicates:
- 0
- 1 {((KEY.reducesinkkey0 * 10) < '1000')}
- outputColumnNames: _col0, _col1, _col5, _col6
- Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
- Select Operator
- expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string)
- outputColumnNames: _col0, _col1, _col2, _col3
- Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
- Limit
- Number of rows: 10
- Statistics: Num rows: 10 Data size: 100 Basic stats: COMPLETE Column stats: NONE
- File Output Operator
- compressed: false
- Statistics: Num rows: 10 Data size: 100 Basic stats: COMPLETE Column stats: NONE
- table:
- input format: org.apache.hadoop.mapred.TextInputFormat
- output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
- serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ Map Join Operator
+ condition map:
+ Right Outer Join0 to 1
+ condition expressions:
+ 0 {key} {value}
+ 1 {key} {value}
+ filter predicates:
+ 0
+ 1 {((key * 10) < '1000')}
+ keys:
+ 0 key (type: string)
+ 1 key (type: string)
+ outputColumnNames: _col0, _col1, _col5, _col6
+ input vertices:
+ 0 Map 1
+ Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string)
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
+ Limit
+ Number of rows: 10
+ Statistics: Num rows: 10 Data size: 100 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 10 Data size: 100 Basic stats: COMPLETE Column stats: NONE
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ Local Work:
+ Map Reduce Local Work
Stage: Stage-0
Fetch Operator
@@ -197,16 +215,16 @@ POSTHOOK: query: SELECT /*+ MAPJOIN(a) */ * FROM src a RIGHT OUTER JOIN src b on
POSTHOOK: type: QUERY
POSTHOOK: Input: default@src
#### A masked pattern was here ####
-0 val_0 0 val_0
-0 val_0 0 val_0
-0 val_0 0 val_0
-0 val_0 0 val_0
-0 val_0 0 val_0
-0 val_0 0 val_0
-0 val_0 0 val_0
-0 val_0 0 val_0
-0 val_0 0 val_0
-NULL NULL 105 val_105
+NULL NULL 238 val_238
+86 val_86 86 val_86
+NULL NULL 311 val_311
+27 val_27 27 val_27
+NULL NULL 165 val_165
+NULL NULL 409 val_409
+NULL NULL 255 val_255
+NULL NULL 278 val_278
+98 val_98 98 val_98
+98 val_98 98 val_98
PREHOOK: query: -- field filter on outer join
EXPLAIN
SELECT /*+ MAPJOIN(a) */ * FROM src a RIGHT OUTER JOIN
@@ -218,14 +236,35 @@ SELECT /*+ MAPJOIN(a) */ * FROM src a RIGHT OUTER JOIN
(select key, named_struct('key', key, 'value', value) as kv from src) b on a.key=b.key AND b.kv.key > 200 limit 10
POSTHOOK: type: QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-2 is a root stage
+ Stage-1 depends on stages: Stage-2
Stage-0 depends on stages: Stage-1
STAGE PLANS:
+ Stage: Stage-2
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 2
+ Map Operator Tree:
+ TableScan
+ alias: a
+ Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0 {value}
+ 1 {_col0} {_col1}
+ filter predicates:
+ 0
+ 1 {(_col1.key > 200)}
+ keys:
+ 0 key (type: string)
+ 1 _col0 (type: string)
+ Local Work:
+ Map Reduce Local Work
+
Stage: Stage-1
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 3 (PARTITION-LEVEL SORT, 3)
#### A masked pattern was here ####
Vertices:
Map 1
@@ -237,50 +276,38 @@ STAGE PLANS:
                          expressions: key (type: string), named_struct('key',key,'value',value) (type: struct<key:string,value:string>)
outputColumnNames: _col0, _col1
Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: _col0 (type: string)
- sort order: +
- Map-reduce partition columns: _col0 (type: string)
- Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
-                           value expressions: _col1 (type: struct<key:string,value:string>)
- Map 3
- Map Operator Tree:
- TableScan
- alias: a
- Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: string)
- sort order: +
- Map-reduce partition columns: key (type: string)
- Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
- value expressions: value (type: string)
- Reducer 2
- Reduce Operator Tree:
- Join Operator
- condition map:
- Right Outer Join0 to 1
- condition expressions:
- 0 {KEY.reducesinkkey0} {VALUE._col0}
- 1 {KEY.reducesinkkey0} {VALUE._col0}
- filter predicates:
- 0
- 1 {(VALUE._col0.key > 200)}
- outputColumnNames: _col0, _col1, _col5, _col6
- Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
- Select Operator
-                      expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: struct<key:string,value:string>)
- outputColumnNames: _col0, _col1, _col2, _col3
- Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
- Limit
- Number of rows: 10
- Statistics: Num rows: 10 Data size: 100 Basic stats: COMPLETE Column stats: NONE
- File Output Operator
- compressed: false
- Statistics: Num rows: 10 Data size: 100 Basic stats: COMPLETE Column stats: NONE
- table:
- input format: org.apache.hadoop.mapred.TextInputFormat
- output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
- serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ Map Join Operator
+ condition map:
+ Right Outer Join0 to 1
+ condition expressions:
+ 0 {key} {value}
+ 1 {_col0} {_col1}
+ filter predicates:
+ 0
+ 1 {(_col1.key > 200)}
+ keys:
+ 0 key (type: string)
+ 1 _col0 (type: string)
+ outputColumnNames: _col0, _col1, _col5, _col6
+ input vertices:
+ 0 Map 2
+ Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+                        expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: struct<key:string,value:string>)
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
+ Limit
+ Number of rows: 10
+ Statistics: Num rows: 10 Data size: 100 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 10 Data size: 100 Basic stats: COMPLETE Column stats: NONE
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ Local Work:
+ Map Reduce Local Work
Stage: Stage-0
Fetch Operator
@@ -298,16 +325,16 @@ POSTHOOK: query: SELECT /*+ MAPJOIN(a) */ * FROM src a RIGHT OUTER JOIN
POSTHOOK: type: QUERY
POSTHOOK: Input: default@src
#### A masked pattern was here ####
-NULL NULL 0 {"key":"0","value":"val_0"}
-NULL NULL 0 {"key":"0","value":"val_0"}
-NULL NULL 0 {"key":"0","value":"val_0"}
-NULL NULL 105 {"key":"105","value":"val_105"}
-NULL NULL 111 {"key":"111","value":"val_111"}
-NULL NULL 114 {"key":"114","value":"val_114"}
-NULL NULL 12 {"key":"12","value":"val_12"}
-NULL NULL 12 {"key":"12","value":"val_12"}
-NULL NULL 120 {"key":"120","value":"val_120"}
-NULL NULL 120 {"key":"120","value":"val_120"}
+238 val_238 238 {"key":"238","value":"val_238"}
+238 val_238 238 {"key":"238","value":"val_238"}
+NULL NULL 86 {"key":"86","value":"val_86"}
+311 val_311 311 {"key":"311","value":"val_311"}
+311 val_311 311 {"key":"311","value":"val_311"}
+311 val_311 311 {"key":"311","value":"val_311"}
+NULL NULL 27 {"key":"27","value":"val_27"}
+NULL NULL 165 {"key":"165","value":"val_165"}
+409 val_409 409 {"key":"409","value":"val_409"}
+409 val_409 409 {"key":"409","value":"val_409"}
PREHOOK: query: EXPLAIN
SELECT /*+ MAPJOIN(a) */ * FROM src a RIGHT OUTER JOIN src b on a.key=b.key AND true limit 10
PREHOOK: type: QUERY
@@ -315,14 +342,13 @@ POSTHOOK: query: EXPLAIN
SELECT /*+ MAPJOIN(a) */ * FROM src a RIGHT OUTER JOIN src b on a.key=b.key AND true limit 10
POSTHOOK: type: QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-2 is a root stage
+ Stage-1 depends on stages: Stage-2
Stage-0 depends on stages: Stage-1
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-2
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 3 (PARTITION-LEVEL SORT, 3)
#### A masked pattern was here ####
Vertices:
Map 1
@@ -330,47 +356,54 @@ STAGE PLANS:
TableScan
alias: a
Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: string)
- sort order: +
- Map-reduce partition columns: key (type: string)
- Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
- value expressions: value (type: string)
- Map 3
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0 {value}
+ 1 {key} {value}
+ keys:
+ 0 key (type: string)
+ 1 key (type: string)
+ Local Work:
+ Map Reduce Local Work
+
+ Stage: Stage-1
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 2
Map Operator Tree:
TableScan
alias: b
Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: string)
- sort order: +
- Map-reduce partition columns: key (type: string)
- Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
- value expressions: value (type: string)
- Reducer 2
- Reduce Operator Tree:
- Join Operator
- condition map:
- Right Outer Join0 to 1
- condition expressions:
- 0 {KEY.reducesinkkey0} {VALUE._col0}
- 1 {KEY.reducesinkkey0} {VALUE._col0}
- outputColumnNames: _col0, _col1, _col5, _col6
- Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
- Select Operator
- expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string)
- outputColumnNames: _col0, _col1, _col2, _col3
- Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
- Limit
- Number of rows: 10
- Statistics: Num rows: 10 Data size: 100 Basic stats: COMPLETE Column stats: NONE
- File Output Operator
- compressed: false
- Statistics: Num rows: 10 Data size: 100 Basic stats: COMPLETE Column stats: NONE
- table:
- input format: org.apache.hadoop.mapred.TextInputFormat
- output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
- serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ Map Join Operator
+ condition map:
+ Right Outer Join0 to 1
+ condition expressions:
+ 0 {key} {value}
+ 1 {key} {value}
+ keys:
+ 0 key (type: string)
+ 1 key (type: string)
+ outputColumnNames: _col0, _col1, _col5, _col6
+ input vertices:
+ 0 Map 1
+ Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string)
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
+ Limit
+ Number of rows: 10
+ Statistics: Num rows: 10 Data size: 100 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 10 Data size: 100 Basic stats: COMPLETE Column stats: NONE
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ Local Work:
+ Map Reduce Local Work
Stage: Stage-0
Fetch Operator
@@ -386,16 +419,16 @@ POSTHOOK: query: SELECT /*+ MAPJOIN(a) */ * FROM src a RIGHT OUTER JOIN src b on
POSTHOOK: type: QUERY
POSTHOOK: Input: default@src
#### A masked pattern was here ####
-0 val_0 0 val_0
-0 val_0 0 val_0
-0 val_0 0 val_0
-0 val_0 0 val_0
-0 val_0 0 val_0
-0 val_0 0 val_0
-0 val_0 0 val_0
-0 val_0 0 val_0
-0 val_0 0 val_0
-105 val_105 105 val_105
+238 val_238 238 val_238
+238 val_238 238 val_238
+86 val_86 86 val_86
+311 val_311 311 val_311
+311 val_311 311 val_311
+311 val_311 311 val_311
+27 val_27 27 val_27
+165 val_165 165 val_165
+165 val_165 165 val_165
+409 val_409 409 val_409
PREHOOK: query: EXPLAIN
SELECT /*+ MAPJOIN(a) */ * FROM src a RIGHT OUTER JOIN src b on a.key=b.key AND b.key * 10 < '1000' limit 10
PREHOOK: type: QUERY
@@ -403,14 +436,13 @@ POSTHOOK: query: EXPLAIN
SELECT /*+ MAPJOIN(a) */ * FROM src a RIGHT OUTER JOIN src b on a.key=b.key AND b.key * 10 < '1000' limit 10
POSTHOOK: type: QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-2 is a root stage
+ Stage-1 depends on stages: Stage-2
Stage-0 depends on stages: Stage-1
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-2
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 3 (PARTITION-LEVEL SORT, 3)
#### A masked pattern was here ####
Vertices:
Map 1
@@ -421,13 +453,21 @@ STAGE PLANS:
Filter Operator
predicate: ((key * 10) < '1000') (type: boolean)
Statistics: Num rows: 166 Data size: 1763 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: string)
- sort order: +
- Map-reduce partition columns: key (type: string)
- Statistics: Num rows: 166 Data size: 1763 Basic stats: COMPLETE Column stats: NONE
- value expressions: value (type: string)
- Map 3
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0 {value}
+ 1 {key} {value}
+ keys:
+ 0 key (type: string)
+ 1 key (type: string)
+ Local Work:
+ Map Reduce Local Work
+
+ Stage: Stage-1
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 2
Map Operator Tree:
TableScan
alias: b
@@ -435,36 +475,35 @@ STAGE PLANS:
Filter Operator
predicate: ((key * 10) < '1000') (type: boolean)
Statistics: Num rows: 166 Data size: 1763 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: string)
- sort order: +
- Map-reduce partition columns: key (type: string)
- Statistics: Num rows: 166 Data size: 1763 Basic stats: COMPLETE Column stats: NONE
- value expressions: value (type: string)
- Reducer 2
- Reduce Operator Tree:
- Join Operator
- condition map:
- Right Outer Join0 to 1
- condition expressions:
- 0 {KEY.reducesinkkey0} {VALUE._col0}
- 1 {KEY.reducesinkkey0} {VALUE._col0}
- outputColumnNames: _col0, _col1, _col5, _col6
- Statistics: Num rows: 182 Data size: 1939 Basic stats: COMPLETE Column stats: NONE
- Select Operator
- expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string)
- outputColumnNames: _col0, _col1, _col2, _col3
- Statistics: Num rows: 182 Data size: 1939 Basic stats: COMPLETE Column stats: NONE
- Limit
- Number of rows: 10
- Statistics: Num rows: 10 Data size: 100 Basic stats: COMPLETE Column stats: NONE
- File Output Operator
- compressed: false
- Statistics: Num rows: 10 Data size: 100 Basic stats: COMPLETE Column stats: NONE
- table:
- input format: org.apache.hadoop.mapred.TextInputFormat
- output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
- serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ Map Join Operator
+ condition map:
+ Right Outer Join0 to 1
+ condition expressions:
+ 0 {key} {value}
+ 1 {key} {value}
+ keys:
+ 0 key (type: string)
+ 1 key (type: string)
+ outputColumnNames: _col0, _col1, _col5, _col6
+ input vertices:
+ 0 Map 1
+ Statistics: Num rows: 182 Data size: 1939 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string)
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Statistics: Num rows: 182 Data size: 1939 Basic stats: COMPLETE Column stats: NONE
+ Limit
+ Number of rows: 10
+ Statistics: Num rows: 10 Data size: 100 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 10 Data size: 100 Basic stats: COMPLETE Column stats: NONE
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ Local Work:
+ Map Reduce Local Work
Stage: Stage-0
Fetch Operator
@@ -480,16 +519,16 @@ POSTHOOK: query: SELECT /*+ MAPJOIN(a) */ * FROM src a RIGHT OUTER JOIN src b on
POSTHOOK: type: QUERY
POSTHOOK: Input: default@src
#### A masked pattern was here ####
-0 val_0 0 val_0
-0 val_0 0 val_0
-0 val_0 0 val_0
-0 val_0 0 val_0
-0 val_0 0 val_0
-0 val_0 0 val_0
-0 val_0 0 val_0
-0 val_0 0 val_0
-0 val_0 0 val_0
-12 val_12 12 val_12
+86 val_86 86 val_86
+27 val_27 27 val_27
+98 val_98 98 val_98
+98 val_98 98 val_98
+66 val_66 66 val_66
+37 val_37 37 val_37
+37 val_37 37 val_37
+15 val_15 15 val_15
+15 val_15 15 val_15
+82 val_82 82 val_82
PREHOOK: query: EXPLAIN
SELECT /*+ MAPJOIN(a) */ * FROM src a RIGHT OUTER JOIN
(select key, named_struct('key', key, 'value', value) as kv from src) b on a.key=b.key AND b.kv.key > 200 limit 10
@@ -499,14 +538,32 @@ SELECT /*+ MAPJOIN(a) */ * FROM src a RIGHT OUTER JOIN
(select key, named_struct('key', key, 'value', value) as kv from src) b on a.key=b.key AND b.kv.key > 200 limit 10
POSTHOOK: type: QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-2 is a root stage
+ Stage-1 depends on stages: Stage-2
Stage-0 depends on stages: Stage-1
STAGE PLANS:
+ Stage: Stage-2
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 2
+ Map Operator Tree:
+ TableScan
+ alias: a
+ Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0 {value}
+ 1 {_col0} {_col1}
+ keys:
+ 0 key (type: string)
+ 1 _col0 (type: string)
+ Local Work:
+ Map Reduce Local Work
+
Stage: Stage-1
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 3 (PARTITION-LEVEL SORT, 3)
#### A masked pattern was here ####
Vertices:
Map 1
@@ -521,47 +578,35 @@ STAGE PLANS:
Filter Operator
predicate: (_col1.key > 200) (type: boolean)
Statistics: Num rows: 166 Data size: 1763 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: _col0 (type: string)
- sort order: +
- Map-reduce partition columns: _col0 (type: string)
- Statistics: Num rows: 166 Data size: 1763 Basic stats: COMPLETE Column stats: NONE
-                            value expressions: _col1 (type: struct<key:string,value:string>)
- Map 3
- Map Operator Tree:
- TableScan
- alias: a
- Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: string)
- sort order: +
- Map-reduce partition columns: key (type: string)
- Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
- value expressions: value (type: string)
- Reducer 2
- Reduce Operator Tree:
- Join Operator
- condition map:
- Right Outer Join0 to 1
- condition expressions:
- 0 {KEY.reducesinkkey0} {VALUE._col0}
- 1 {KEY.reducesinkkey0} {VALUE._col0}
- outputColumnNames: _col0, _col1, _col5, _col6
- Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
- Select Operator
-                      expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: struct<key:string,value:string>)
- outputColumnNames: _col0, _col1, _col2, _col3
- Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
- Limit
- Number of rows: 10
- Statistics: Num rows: 10 Data size: 100 Basic stats: COMPLETE Column stats: NONE
- File Output Operator
- compressed: false
- Statistics: Num rows: 10 Data size: 100 Basic stats: COMPLETE Column stats: NONE
- table:
- input format: org.apache.hadoop.mapred.TextInputFormat
- output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
- serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ Map Join Operator
+ condition map:
+ Right Outer Join0 to 1
+ condition expressions:
+ 0 {key} {value}
+ 1 {_col0} {_col1}
+ keys:
+ 0 key (type: string)
+ 1 _col0 (type: string)
+ outputColumnNames: _col0, _col1, _col5, _col6
+ input vertices:
+ 0 Map 2
+ Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+                          expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: struct<key:string,value:string>)
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
+ Limit
+ Number of rows: 10
+ Statistics: Num rows: 10 Data size: 100 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 10 Data size: 100 Basic stats: COMPLETE Column stats: NONE
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ Local Work:
+ Map Reduce Local Work
Stage: Stage-0
Fetch Operator
@@ -579,13 +624,13 @@ POSTHOOK: query: SELECT /*+ MAPJOIN(a) */ * FROM src a RIGHT OUTER JOIN
POSTHOOK: type: QUERY
POSTHOOK: Input: default@src
#### A masked pattern was here ####
-201 val_201 201 {"key":"201","value":"val_201"}
-207 val_207 207 {"key":"207","value":"val_207"}
-207 val_207 207 {"key":"207","value":"val_207"}
-207 val_207 207 {"key":"207","value":"val_207"}
-207 val_207 207 {"key":"207","value":"val_207"}
-213 val_213 213 {"key":"213","value":"val_213"}
-213 val_213 213 {"key":"213","value":"val_213"}
-213 val_213 213 {"key":"213","value":"val_213"}
-213 val_213 213 {"key":"213","value":"val_213"}
-216 val_216 216 {"key":"216","value":"val_216"}
+238 val_238 238 {"key":"238","value":"val_238"}
+238 val_238 238 {"key":"238","value":"val_238"}
+311 val_311 311 {"key":"311","value":"val_311"}
+311 val_311 311 {"key":"311","value":"val_311"}
+311 val_311 311 {"key":"311","value":"val_311"}
+409 val_409 409 {"key":"409","value":"val_409"}
+409 val_409 409 {"key":"409","value":"val_409"}
+409 val_409 409 {"key":"409","value":"val_409"}
+255 val_255 255 {"key":"255","value":"val_255"}
+255 val_255 255 {"key":"255","value":"val_255"}
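
Two details in mapjoin1.q.out are worth calling out. First, each golden result block changes only in row order, not in content: the reduce-side join emitted rows in shuffle-key order, while the map join emits them in the big table's scan order, so every LIMIT 10 snapshot now captures a different but equally valid prefix. Second, residual outer-join filters travel with the hash table: the filter predicates entry (for example {((key * 10) < '1000')}) is recorded in the Spark HashTable Sink Operator and re-applied by the Map Join Operator, rewritten against plain column names rather than KEY/VALUE references. The query being exercised, quoted from the diff itself:

  -- quoted from mapjoin1.q as echoed in the .q.out above
  SELECT /*+ MAPJOIN(a) */ *
  FROM src a
  RIGHT OUTER JOIN src b
    ON a.key = b.key AND b.key * 10 < '1000'
  LIMIT 10;
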
diff --git ql/src/test/results/clientpositive/spark/mapjoin_distinct.q.out ql/src/test/results/clientpositive/spark/mapjoin_distinct.q.out
index dc7241c..fcc8d6a 100644
--- ql/src/test/results/clientpositive/spark/mapjoin_distinct.q.out
+++ ql/src/test/results/clientpositive/spark/mapjoin_distinct.q.out
@@ -11,67 +11,75 @@ ON ( c.key=d.key AND c.ds='2008-04-08' AND d.ds='2008-04-08')
SELECT /*+ MAPJOIN(d) */ DISTINCT c.value
POSTHOOK: type: QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-2 is a root stage
+ Stage-1 depends on stages: Stage-2
Stage-0 depends on stages: Stage-1
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-2
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 5 (PARTITION-LEVEL SORT, 1)
- Reducer 3 <- Reducer 2 (GROUP PARTITION-LEVEL SORT, 1)
- Reducer 4 <- Reducer 3 (GROUP, 1)
#### A masked pattern was here ####
Vertices:
- Map 1
+ Map 4
Map Operator Tree:
TableScan
- alias: c
+ alias: d
Statistics: Num rows: 1000 Data size: 10624 Basic stats: COMPLETE Column stats: NONE
Filter Operator
predicate: key is not null (type: boolean)
Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: string)
- sort order: +
- Map-reduce partition columns: key (type: string)
- Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
- value expressions: value (type: string)
- Map 5
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0 {value}
+ 1
+ keys:
+ 0 key (type: string)
+ 1 key (type: string)
+ Local Work:
+ Map Reduce Local Work
+
+ Stage: Stage-1
+ Spark
+ Edges:
+ Reducer 2 <- Map 1 (GROUP PARTITION-LEVEL SORT, 1)
+ Reducer 3 <- Reducer 2 (GROUP, 1)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
Map Operator Tree:
TableScan
- alias: d
+ alias: c
Statistics: Num rows: 1000 Data size: 10624 Basic stats: COMPLETE Column stats: NONE
Filter Operator
predicate: key is not null (type: boolean)
Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: string)
- sort order: +
- Map-reduce partition columns: key (type: string)
- Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {value}
+ 1
+ keys:
+ 0 key (type: string)
+ 1 key (type: string)
+ outputColumnNames: _col1
+ input vertices:
+ 1 Map 4
+ Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
+ Group By Operator
+ keys: _col1 (type: string)
+ mode: hash
+ outputColumnNames: _col0
+ Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ key expressions: _col0 (type: string)
+ sort order: +
+ Map-reduce partition columns: rand() (type: double)
+ Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
+ Local Work:
+ Map Reduce Local Work
Reducer 2
Reduce Operator Tree:
- Join Operator
- condition map:
- Inner Join 0 to 1
- condition expressions:
- 0 {VALUE._col0}
- 1
- outputColumnNames: _col1
- Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
- Group By Operator
- keys: _col1 (type: string)
- mode: hash
- outputColumnNames: _col0
- Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: _col0 (type: string)
- sort order: +
- Map-reduce partition columns: rand() (type: double)
- Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
- Reducer 3
- Reduce Operator Tree:
Group By Operator
keys: KEY._col0 (type: string)
mode: partials
@@ -82,7 +90,7 @@ STAGE PLANS:
sort order: +
Map-reduce partition columns: _col0 (type: string)
Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
- Reducer 4
+ Reducer 3
Reduce Operator Tree:
Group By Operator
keys: KEY._col0 (type: string)
@@ -148,66 +156,74 @@ ON ( c.key=d.key AND c.ds='2008-04-08' AND d.ds='2008-04-08')
SELECT /*+ MAPJOIN(d) */ DISTINCT c.value
POSTHOOK: type: QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-2 is a root stage
+ Stage-1 depends on stages: Stage-2
Stage-0 depends on stages: Stage-1
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-2
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 4 (PARTITION-LEVEL SORT, 3)
- Reducer 3 <- Reducer 2 (GROUP, 3)
#### A masked pattern was here ####
Vertices:
- Map 1
+ Map 3
Map Operator Tree:
TableScan
- alias: c
+ alias: d
Statistics: Num rows: 1000 Data size: 10624 Basic stats: COMPLETE Column stats: NONE
Filter Operator
predicate: key is not null (type: boolean)
Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: string)
- sort order: +
- Map-reduce partition columns: key (type: string)
- Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
- value expressions: value (type: string)
- Map 4
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0 {value}
+ 1
+ keys:
+ 0 key (type: string)
+ 1 key (type: string)
+ Local Work:
+ Map Reduce Local Work
+
+ Stage: Stage-1
+ Spark
+ Edges:
+ Reducer 2 <- Map 1 (GROUP, 3)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
Map Operator Tree:
TableScan
- alias: d
+ alias: c
Statistics: Num rows: 1000 Data size: 10624 Basic stats: COMPLETE Column stats: NONE
Filter Operator
predicate: key is not null (type: boolean)
Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: string)
- sort order: +
- Map-reduce partition columns: key (type: string)
- Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {value}
+ 1
+ keys:
+ 0 key (type: string)
+ 1 key (type: string)
+ outputColumnNames: _col1
+ input vertices:
+ 1 Map 3
+ Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
+ Group By Operator
+ keys: _col1 (type: string)
+ mode: hash
+ outputColumnNames: _col0
+ Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ key expressions: _col0 (type: string)
+ sort order: +
+ Map-reduce partition columns: _col0 (type: string)
+ Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
+ Local Work:
+ Map Reduce Local Work
Reducer 2
Reduce Operator Tree:
- Join Operator
- condition map:
- Inner Join 0 to 1
- condition expressions:
- 0 {VALUE._col0}
- 1
- outputColumnNames: _col1
- Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
- Group By Operator
- keys: _col1 (type: string)
- mode: hash
- outputColumnNames: _col0
- Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: _col0 (type: string)
- sort order: +
- Map-reduce partition columns: _col0 (type: string)
- Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
- Reducer 3
- Reduce Operator Tree:
Group By Operator
keys: KEY._col0 (type: string)
mode: mergepartial
@@ -272,62 +288,70 @@ ON ( c.key=d.key AND c.ds='2008-04-08' AND d.ds='2008-04-08')
SELECT /*+ MAPJOIN(d) */ DISTINCT c.value
POSTHOOK: type: QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-2 is a root stage
+ Stage-1 depends on stages: Stage-2
Stage-0 depends on stages: Stage-1
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-2
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 5 (PARTITION-LEVEL SORT, 3)
- Reducer 3 <- Reducer 2 (GROUP PARTITION-LEVEL SORT, 3)
- Reducer 4 <- Reducer 3 (GROUP, 3)
#### A masked pattern was here ####
Vertices:
- Map 1
+ Map 4
Map Operator Tree:
TableScan
- alias: c
+ alias: d
Statistics: Num rows: 1000 Data size: 10624 Basic stats: COMPLETE Column stats: NONE
Filter Operator
predicate: key is not null (type: boolean)
Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: string)
- sort order: +
- Map-reduce partition columns: key (type: string)
- Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
- value expressions: value (type: string)
- Map 5
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0 {value}
+ 1
+ keys:
+ 0 key (type: string)
+ 1 key (type: string)
+ Local Work:
+ Map Reduce Local Work
+
+ Stage: Stage-1
+ Spark
+ Edges:
+ Reducer 2 <- Map 1 (GROUP PARTITION-LEVEL SORT, 3)
+ Reducer 3 <- Reducer 2 (GROUP, 3)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
Map Operator Tree:
TableScan
- alias: d
+ alias: c
Statistics: Num rows: 1000 Data size: 10624 Basic stats: COMPLETE Column stats: NONE
Filter Operator
predicate: key is not null (type: boolean)
Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: string)
- sort order: +
- Map-reduce partition columns: key (type: string)
- Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {value}
+ 1
+ keys:
+ 0 key (type: string)
+ 1 key (type: string)
+ outputColumnNames: _col1
+ input vertices:
+ 1 Map 4
+ Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ key expressions: _col1 (type: string)
+ sort order: +
+ Map-reduce partition columns: rand() (type: double)
+ Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
+ Local Work:
+ Map Reduce Local Work
Reducer 2
Reduce Operator Tree:
- Join Operator
- condition map:
- Inner Join 0 to 1
- condition expressions:
- 0 {VALUE._col0}
- 1
- outputColumnNames: _col1
- Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: _col1 (type: string)
- sort order: +
- Map-reduce partition columns: rand() (type: double)
- Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
- Reducer 3
- Reduce Operator Tree:
Group By Operator
keys: KEY._col0 (type: string)
mode: partial1
@@ -338,7 +362,7 @@ STAGE PLANS:
sort order: +
Map-reduce partition columns: _col0 (type: string)
Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
- Reducer 4
+ Reducer 3
Reduce Operator Tree:
Group By Operator
keys: KEY._col0 (type: string)
@@ -404,61 +428,69 @@ ON ( c.key=d.key AND c.ds='2008-04-08' AND d.ds='2008-04-08')
SELECT /*+ MAPJOIN(d) */ DISTINCT c.value
POSTHOOK: type: QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-2 is a root stage
+ Stage-1 depends on stages: Stage-2
Stage-0 depends on stages: Stage-1
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-2
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 4 (PARTITION-LEVEL SORT, 3)
- Reducer 3 <- Reducer 2 (GROUP, 3)
#### A masked pattern was here ####
Vertices:
- Map 1
+ Map 3
Map Operator Tree:
TableScan
- alias: c
+ alias: d
Statistics: Num rows: 1000 Data size: 10624 Basic stats: COMPLETE Column stats: NONE
Filter Operator
predicate: key is not null (type: boolean)
Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: string)
- sort order: +
- Map-reduce partition columns: key (type: string)
- Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
- value expressions: value (type: string)
- Map 4
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0 {value}
+ 1
+ keys:
+ 0 key (type: string)
+ 1 key (type: string)
+ Local Work:
+ Map Reduce Local Work
+
+ Stage: Stage-1
+ Spark
+ Edges:
+ Reducer 2 <- Map 1 (GROUP, 3)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
Map Operator Tree:
TableScan
- alias: d
+ alias: c
Statistics: Num rows: 1000 Data size: 10624 Basic stats: COMPLETE Column stats: NONE
Filter Operator
predicate: key is not null (type: boolean)
Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: string)
- sort order: +
- Map-reduce partition columns: key (type: string)
- Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {value}
+ 1
+ keys:
+ 0 key (type: string)
+ 1 key (type: string)
+ outputColumnNames: _col1
+ input vertices:
+ 1 Map 3
+ Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ key expressions: _col1 (type: string)
+ sort order: +
+ Map-reduce partition columns: _col1 (type: string)
+ Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
+ Local Work:
+ Map Reduce Local Work
Reducer 2
Reduce Operator Tree:
- Join Operator
- condition map:
- Inner Join 0 to 1
- condition expressions:
- 0 {VALUE._col0}
- 1
- outputColumnNames: _col1
- Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: _col1 (type: string)
- sort order: +
- Map-reduce partition columns: _col1 (type: string)
- Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
- Reducer 3
- Reduce Operator Tree:
Group By Operator
keys: KEY._col0 (type: string)
mode: complete
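
All four mapjoin_distinct.q.out plans run the same hinted DISTINCT query; what changes between them is only the group-by pipeline downstream of the new Map Join Operator (hash-mode map-side aggregation or none, and a rand()-partitioned two-stage reduce or a single one, the usual signatures of the map-aggregation and skew-in-data settings; that reading is an inference, since the .q file's SET statements are not part of this diff). The shared query shape in Hive's FROM-first form, assuming the usual srcpart test table because the table names fall outside the excerpted hunks:

  -- table name assumed; aliases and join condition are quoted from the .q.out above
  FROM srcpart c
  JOIN srcpart d ON (c.key = d.key AND c.ds = '2008-04-08' AND d.ds = '2008-04-08')
  SELECT /*+ MAPJOIN(d) */ DISTINCT c.value;

In every variant the small side d moves into its own root stage as a Spark HashTable Sink, and the DISTINCT pipeline loses exactly one shuffle because the join no longer needs a Reducer of its own.
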
diff --git ql/src/test/results/clientpositive/spark/semijoin.q.out ql/src/test/results/clientpositive/spark/semijoin.q.out
index 2b8e04b..8168832 100644
--- ql/src/test/results/clientpositive/spark/semijoin.q.out
+++ ql/src/test/results/clientpositive/spark/semijoin.q.out
@@ -931,31 +931,16 @@ PREHOOK: type: QUERY
POSTHOOK: query: explain select /*+ mapjoin(b) */ a.key from t3 a left semi join t1 b on a.key = b.key sort by a.key
POSTHOOK: type: QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-2 is a root stage
+ Stage-1 depends on stages: Stage-2
Stage-0 depends on stages: Stage-1
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-2
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 4 (PARTITION-LEVEL SORT, 3)
- Reducer 3 <- Reducer 2 (SORT, 3)
#### A masked pattern was here ####
Vertices:
- Map 1
- Map Operator Tree:
- TableScan
- alias: a
- Statistics: Num rows: 22 Data size: 163 Basic stats: COMPLETE Column stats: NONE
- Filter Operator
- predicate: key is not null (type: boolean)
- Statistics: Num rows: 11 Data size: 81 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int)
- sort order: +
- Map-reduce partition columns: key (type: int)
- Statistics: Num rows: 11 Data size: 81 Basic stats: COMPLETE Column stats: NONE
- Map 4
+ Map 3
Map Operator Tree:
TableScan
alias: b
@@ -972,27 +957,51 @@ STAGE PLANS:
mode: hash
outputColumnNames: _col0
Statistics: Num rows: 6 Data size: 43 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: _col0 (type: int)
- sort order: +
- Map-reduce partition columns: _col0 (type: int)
- Statistics: Num rows: 6 Data size: 43 Basic stats: COMPLETE Column stats: NONE
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0 {key}
+ 1
+ keys:
+ 0 key (type: int)
+ 1 _col0 (type: int)
+ Local Work:
+ Map Reduce Local Work
+
+ Stage: Stage-1
+ Spark
+ Edges:
+ Reducer 2 <- Map 1 (SORT, 3)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: a
+ Statistics: Num rows: 22 Data size: 163 Basic stats: COMPLETE Column stats: NONE
+ Filter Operator
+ predicate: key is not null (type: boolean)
+ Statistics: Num rows: 11 Data size: 81 Basic stats: COMPLETE Column stats: NONE
+ Map Join Operator
+ condition map:
+ Left Semi Join 0 to 1
+ condition expressions:
+ 0 {key}
+ 1
+ keys:
+ 0 key (type: int)
+ 1 _col0 (type: int)
+ outputColumnNames: _col0
+ input vertices:
+ 1 Map 3
+ Statistics: Num rows: 12 Data size: 89 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ key expressions: _col0 (type: int)
+ sort order: +
+ Statistics: Num rows: 12 Data size: 89 Basic stats: COMPLETE Column stats: NONE
+ Local Work:
+ Map Reduce Local Work
Reducer 2
Reduce Operator Tree:
- Join Operator
- condition map:
- Left Semi Join 0 to 1
- condition expressions:
- 0 {KEY.reducesinkkey0}
- 1
- outputColumnNames: _col0
- Statistics: Num rows: 12 Data size: 89 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: _col0 (type: int)
- sort order: +
- Statistics: Num rows: 12 Data size: 89 Basic stats: COMPLETE Column stats: NONE
- Reducer 3
- Reduce Operator Tree:
Select Operator
expressions: KEY.reducesinkkey0 (type: int)
outputColumnNames: _col0
@@ -1387,31 +1396,16 @@ PREHOOK: type: QUERY
POSTHOOK: query: explain select /*+ mapjoin(b, c) */ a.key from t3 a left semi join t1 b on a.key = b.key left semi join t2 c on a.key = c.key sort by a.key
POSTHOOK: type: QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-2 is a root stage
+ Stage-1 depends on stages: Stage-2
Stage-0 depends on stages: Stage-1
STAGE PLANS:
- Stage: Stage-1
+ Stage: Stage-2
Spark
- Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 4 (PARTITION-LEVEL SORT, 3), Map 5 (PARTITION-LEVEL SORT, 3)
- Reducer 3 <- Reducer 2 (SORT, 3)
#### A masked pattern was here ####
Vertices:
- Map 1
- Map Operator Tree:
- TableScan
- alias: a
- Statistics: Num rows: 22 Data size: 163 Basic stats: COMPLETE Column stats: NONE
- Filter Operator
- predicate: key is not null (type: boolean)
- Statistics: Num rows: 11 Data size: 81 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: key (type: int)
- sort order: +
- Map-reduce partition columns: key (type: int)
- Statistics: Num rows: 11 Data size: 81 Basic stats: COMPLETE Column stats: NONE
- Map 4
+ Map 3
Map Operator Tree:
TableScan
alias: b
@@ -1428,12 +1422,18 @@ STAGE PLANS:
mode: hash
outputColumnNames: _col0
Statistics: Num rows: 6 Data size: 43 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: _col0 (type: int)
- sort order: +
- Map-reduce partition columns: _col0 (type: int)
- Statistics: Num rows: 6 Data size: 43 Basic stats: COMPLETE Column stats: NONE
- Map 5
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0 {key}
+ 1
+ 2
+ keys:
+ 0 key (type: int)
+ 1 _col0 (type: int)
+ 2 _col0 (type: int)
+ Local Work:
+ Map Reduce Local Work
+ Map 4
Map Operator Tree:
TableScan
alias: c
@@ -1450,29 +1450,57 @@ STAGE PLANS:
mode: hash
outputColumnNames: _col0
Statistics: Num rows: 6 Data size: 45 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: _col0 (type: int)
- sort order: +
- Map-reduce partition columns: _col0 (type: int)
- Statistics: Num rows: 6 Data size: 45 Basic stats: COMPLETE Column stats: NONE
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0 {key}
+ 1
+ 2
+ keys:
+ 0 key (type: int)
+ 1 _col0 (type: int)
+ 2 _col0 (type: int)
+ Local Work:
+ Map Reduce Local Work
+
+ Stage: Stage-1
+ Spark
+ Edges:
+ Reducer 2 <- Map 1 (SORT, 3)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: a
+ Statistics: Num rows: 22 Data size: 163 Basic stats: COMPLETE Column stats: NONE
+ Filter Operator
+ predicate: key is not null (type: boolean)
+ Statistics: Num rows: 11 Data size: 81 Basic stats: COMPLETE Column stats: NONE
+ Map Join Operator
+ condition map:
+ Left Semi Join 0 to 1
+ Left Semi Join 0 to 2
+ condition expressions:
+ 0 {key}
+ 1
+ 2
+ keys:
+ 0 key (type: int)
+ 1 _col0 (type: int)
+ 2 _col0 (type: int)
+ outputColumnNames: _col0
+ input vertices:
+ 1 Map 3
+ 2 Map 4
+ Statistics: Num rows: 24 Data size: 178 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ key expressions: _col0 (type: int)
+ sort order: +
+ Statistics: Num rows: 24 Data size: 178 Basic stats: COMPLETE Column stats: NONE
+ Local Work:
+ Map Reduce Local Work
Reducer 2
Reduce Operator Tree:
- Join Operator
- condition map:
- Left Semi Join 0 to 1
- Left Semi Join 0 to 2
- condition expressions:
- 0 {KEY.reducesinkkey0}
- 1
- 2
- outputColumnNames: _col0
- Statistics: Num rows: 24 Data size: 178 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: _col0 (type: int)
- sort order: +
- Statistics: Num rows: 24 Data size: 178 Basic stats: COMPLETE Column stats: NONE
- Reducer 3
- Reduce Operator Tree:
Select Operator
expressions: KEY.reducesinkkey0 (type: int)
outputColumnNames: _col0
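
One semi-join-specific detail in semijoin.q.out: a LEFT SEMI JOIN build side is deduplicated before it is hashed. Each small input passes through a Group By Operator (mode: hash) on the join key and only then reaches the Spark HashTable Sink Operator, so the hash table holds distinct keys and no value columns (the condition expressions for sides 1 and 2 are empty). The hinted query from the second plan, quoted from the diff:

  -- quoted from semijoin.q as echoed in the .q.out above
  EXPLAIN
  SELECT /*+ MAPJOIN(b, c) */ a.key
  FROM t3 a
  LEFT SEMI JOIN t1 b ON a.key = b.key
  LEFT SEMI JOIN t2 c ON a.key = c.key
  SORT BY a.key;

Both build vertices (Map 3 and Map 4) register the same three-way key list, while the probe side keeps its SORT BY shuffle, which is why Stage-1 retains a single edge: Reducer 2 <- Map 1 (SORT, 3).
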
diff --git ql/src/test/results/clientpositive/spark/skewjoin.q.out ql/src/test/results/clientpositive/spark/skewjoin.q.out
index 56b78be..c944de5 100644
--- ql/src/test/results/clientpositive/spark/skewjoin.q.out
+++ ql/src/test/results/clientpositive/spark/skewjoin.q.out
@@ -949,15 +949,34 @@ POSTHOOK: query: EXPLAIN
SELECT /*+ mapjoin(v)*/ sum(hash(k.key)), sum(hash(v.val)) FROM T1 k LEFT OUTER JOIN T1 v ON k.key+1=v.key
POSTHOOK: type: QUERY
STAGE DEPENDENCIES:
- Stage-1 is a root stage
+ Stage-2 is a root stage
+ Stage-1 depends on stages: Stage-2
Stage-0 depends on stages: Stage-1
STAGE PLANS:
+ Stage: Stage-2
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 3
+ Map Operator Tree:
+ TableScan
+ alias: v
+ Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE
+ Spark HashTable Sink Operator
+ condition expressions:
+ 0 {key}
+ 1 {val}
+ keys:
+ 0 (key + 1) (type: double)
+ 1 UDFToDouble(key) (type: double)
+ Local Work:
+ Map Reduce Local Work
+
Stage: Stage-1
Spark
Edges:
- Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 4 (PARTITION-LEVEL SORT, 3)
- Reducer 3 <- Reducer 2 (GROUP, 1)
+ Reducer 2 <- Map 1 (GROUP, 1)
#### A masked pattern was here ####
Vertices:
Map 1
@@ -965,44 +984,32 @@ STAGE PLANS:
TableScan
alias: k
Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE
- Reduce Output Operator
- key expressions: (key + 1) (type: double)
- sort order: +
- Map-reduce partition columns: (key + 1) (type: double)
- Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE
- value expressions: key (type: string)
- Map 4
- Map Operator Tree:
- TableScan
- alias: v
- Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE
- Reduce Output Operator
- key expressions: UDFToDouble(key) (type: double)
- sort order: +
- Map-reduce partition columns: UDFToDouble(key) (type: double)
- Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE
- value expressions: val (type: string)
+ Map Join Operator
+ condition map:
+ Left Outer Join0 to 1
+ condition expressions:
+ 0 {key}
+ 1 {val}
+ keys:
+ 0 (key + 1) (type: double)
+ 1 UDFToDouble(key) (type: double)
+ outputColumnNames: _col0, _col6
+ input vertices:
+ 1 Map 3
+ Statistics: Num rows: 0 Data size: 33 Basic stats: PARTIAL Column stats: NONE
+ Group By Operator
+ aggregations: sum(hash(_col0)), sum(hash(_col6))
+ mode: hash
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 1 Data size: 16 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ sort order:
+ Statistics: Num rows: 1 Data size: 16 Basic stats: COMPLETE Column stats: NONE
+ value expressions: _col0 (type: bigint), _col1 (type: bigint)
+ Local Work:
+ Map Reduce Local Work
Reducer 2
Reduce Operator Tree:
- Join Operator
- condition map:
- Left Outer Join0 to 1
- condition expressions:
- 0 {VALUE._col0}
- 1 {VALUE._col1}
- outputColumnNames: _col0, _col6
- Statistics: Num rows: 0 Data size: 33 Basic stats: PARTIAL Column stats: NONE
- Group By Operator
- aggregations: sum(hash(_col0)), sum(hash(_col6))
- mode: hash
- outputColumnNames: _col0, _col1
- Statistics: Num rows: 1 Data size: 16 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- sort order:
- Statistics: Num rows: 1 Data size: 16 Basic stats: COMPLETE Column stats: NONE
- value expressions: _col0 (type: bigint), _col1 (type: bigint)
- Reducer 3
- Reduce Operator Tree:
Group By Operator
aggregations: sum(VALUE._col0), sum(VALUE._col1)
mode: mergepartial