diff --git data/conf/spark/hive-site.xml data/conf/spark/hive-site.xml index 44eac86..ee484d1 100644 --- data/conf/spark/hive-site.xml +++ data/conf/spark/hive-site.xml @@ -162,7 +162,7 @@ hive.ignore.mapjoin.hint - true + false Whether Hive ignores the mapjoin hint diff --git itests/src/test/resources/testconfiguration.properties itests/src/test/resources/testconfiguration.properties index d6f8267..01a4a32 100644 --- itests/src/test/resources/testconfiguration.properties +++ itests/src/test/resources/testconfiguration.properties @@ -505,7 +505,6 @@ spark.query.files=add_part_multiple.q, \ auto_sortmerge_join_8.q, \ auto_sortmerge_join_9.q, \ auto_sortmerge_join_10.q, \ - auto_sortmerge_join_11.q, \ auto_sortmerge_join_12.q, \ auto_sortmerge_join_13.q, \ auto_sortmerge_join_14.q, \ @@ -523,7 +522,6 @@ spark.query.files=add_part_multiple.q, \ bucketmapjoin3.q, \ bucketmapjoin4.q, \ bucketmapjoin5.q, \ - bucketmapjoin6.q, \ bucketmapjoin7.q, \ bucketmapjoin8.q, \ bucketmapjoin9.q, \ @@ -663,13 +661,11 @@ spark.query.files=add_part_multiple.q, \ join_cond_pushdown_unqual3.q, \ join_cond_pushdown_unqual4.q, \ join_empty.q \ - join_filters.q, \ join_filters_overlap.q, \ join_hive_626.q, \ join_map_ppr.q, \ join_merge_multi_expressions.q, \ join_merging.q, \ - join_nulls.q, \ join_rc.q, \ join_reorder.q, \ join_reorder2.q, \ @@ -799,21 +795,6 @@ spark.query.files=add_part_multiple.q, \ skewjoin_noskew.q, \ skewjoin_union_remove_1.q, \ skewjoin_union_remove_2.q, \ - smb_mapjoin9.q, \ - smb_mapjoin_1.q, \ - smb_mapjoin_2.q, \ - smb_mapjoin_3.q, \ - smb_mapjoin_4.q, \ - smb_mapjoin_5.q, \ - smb_mapjoin_6.q, \ - smb_mapjoin_7.q, \ - smb_mapjoin_8.q, \ - smb_mapjoin_10.q, \ - smb_mapjoin_13.q, \ - smb_mapjoin_14.q, \ - smb_mapjoin_15.q, \ - smb_mapjoin_16.q, \ - smb_mapjoin_17.q, \ smb_mapjoin_18.q, \ smb_mapjoin_19.q, \ smb_mapjoin_20.q, \ @@ -821,14 +802,6 @@ spark.query.files=add_part_multiple.q, \ smb_mapjoin_22.q, \ smb_mapjoin_25.q, \ sort.q, \ - sort_merge_join_desc_1.q, \ - sort_merge_join_desc_2.q, \ - sort_merge_join_desc_3.q, \ - sort_merge_join_desc_4.q, \ - sort_merge_join_desc_5.q, \ - sort_merge_join_desc_6.q, \ - sort_merge_join_desc_7.q, \ - sort_merge_join_desc_8.q, \ spark_test.q, \ stats_counter.q, \ stats_counter_partitioned.q, \ @@ -942,7 +915,6 @@ spark.query.files=add_part_multiple.q, \ vectorization_part_project.q, \ vectorization_pushdown.q, \ vectorization_short_regress.q, \ - vectorized_bucketmapjoin1.q, \ vectorized_case.q, \ vectorized_mapjoin.q, \ vectorized_math_funcs.q, \ diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java index 773c827..1f65a23 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java @@ -29,6 +29,7 @@ import java.util.Set; import java.util.Stack; +import com.clearspring.analytics.util.Preconditions; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.fs.Path; @@ -291,6 +292,56 @@ private static void validateMapJoinTypes(Operator op) } /** + * convert a regular join to a a map-side join, for spark branch + * + * @param opParseCtxMap + * @param op join operator + * @param joinTree qb join tree + * @param mapJoinPos position of the source to be read as part of + * map-reduce framework. All other sources are cached in memory + * @param noCheckOuterJoin + */ + public static MapJoinOperator convertMapJoinForSpark(HiveConf conf, + LinkedHashMap, OpParseContext> opParseCtxMap, + JoinOperator op, QBJoinTree joinTree, int mapJoinPos, boolean noCheckOuterJoin, + boolean validateMapJoinTree) throws SemanticException { + + // outer join cannot be performed on a table which is being cached + JoinCondDesc[] condns = op.getConf().getConds(); + + if (!noCheckOuterJoin) { + if (checkMapJoin(mapJoinPos, condns) < 0) { + throw new SemanticException(ErrorMsg.NO_OUTER_MAPJOIN.getMsg()); + } + } + + // create the map-join operator + MapJoinOperator mapJoinOp = convertJoinOpMapJoinOp(conf, opParseCtxMap, + op, joinTree, mapJoinPos, noCheckOuterJoin); + + List> parentOps = + new ArrayList>(mapJoinOp.getParentOperators()); + for (int i = 0; i < parentOps.size(); i++) { + Operator parentOp = parentOps.get(i); + parentOp.getChildOperators().remove(op); + if (i == mapJoinPos) { + List> grandParentOps = parentOp.getParentOperators(); + Preconditions.checkArgument(grandParentOps.size() == 1, + "AssertionError: expect number of parents to be 0, but was " + grandParentOps.size()); + grandParentOps.get(0).replaceChild(parentOp, mapJoinOp); + mapJoinOp.replaceParent(parentOp, grandParentOps.get(0)); + } + } + + // make sure only map-joins can be performed. + if (validateMapJoinTree) { + validateMapJoinTypes(mapJoinOp); + } + + return mapJoinOp; + } + + /** * convert a regular join to a a map-side join. * * @param opParseCtxMap @@ -497,8 +548,14 @@ public MapJoinOperator generateMapJoinOperator(ParseContext pctx, JoinOperator o LinkedHashMap, OpParseContext> opParseCtxMap = pctx .getOpParseCtx(); - MapJoinOperator mapJoinOp = convertMapJoin(pctx.getConf(), opParseCtxMap, op, - joinTree, mapJoinPos, noCheckOuterJoin, true); + MapJoinOperator mapJoinOp; + if (HiveConf.getVar(hiveConf, HiveConf.ConfVars.HIVE_EXECUTION_ENGINE).equals("spark")) { + mapJoinOp = convertMapJoinForSpark(pctx.getConf(), opParseCtxMap, op, + joinTree, mapJoinPos, noCheckOuterJoin, false); + } else { + mapJoinOp = convertMapJoin(pctx.getConf(), opParseCtxMap, op, + joinTree, mapJoinPos, noCheckOuterJoin, true); + } // create a dummy select to select all columns genSelectPlan(pctx, mapJoinOp); return mapJoinOp; diff --git ql/src/test/results/clientpositive/spark/bucket_map_join_1.q.out ql/src/test/results/clientpositive/spark/bucket_map_join_1.q.out index f24ae73..835ee60 100644 --- ql/src/test/results/clientpositive/spark/bucket_map_join_1.q.out +++ ql/src/test/results/clientpositive/spark/bucket_map_join_1.q.out @@ -104,59 +104,60 @@ TOK_QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 Stage-0 depends on stages: Stage-1 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-2 Spark - Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 4 (PARTITION-LEVEL SORT, 1) - Reducer 3 <- Reducer 2 (GROUP, 1) #### A masked pattern was here #### Vertices: - Map 1 + Map 3 Map Operator Tree: TableScan - alias: a - Statistics: Num rows: 0 Data size: 20 Basic stats: PARTIAL Column stats: NONE + alias: b + Statistics: Num rows: 0 Data size: 21 Basic stats: PARTIAL Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: (key is not null and value is not null) (type: boolean) Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE - Reduce Output Operator - key expressions: key (type: string), value (type: string) - sort order: ++ - Map-reduce partition columns: key (type: string), value (type: string) - Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE - tag: 0 - auto parallelism: false + Spark HashTable Sink Operator + condition expressions: + 0 + 1 + keys: + 0 key (type: string), value (type: string) + 1 key (type: string), value (type: string) + Position of Big Table: 0 + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: #### A masked pattern was here #### Partition - base file name: table1 + base file name: table2 input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: COLUMN_STATS_ACCURATE true SORTBUCKETCOLSPREFIX TRUE bucket_count 1 - bucket_field_name key + bucket_field_name value columns key,value columns.comments columns.types string:string #### A masked pattern was here #### - name default.table1 + name default.table2 numFiles 1 numRows 0 rawDataSize 0 - serialization.ddl struct table1 { string key, string value} + serialization.ddl struct table2 { string key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - totalSize 20 + totalSize 21 #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe @@ -166,67 +167,93 @@ STAGE PLANS: COLUMN_STATS_ACCURATE true SORTBUCKETCOLSPREFIX TRUE bucket_count 1 - bucket_field_name key + bucket_field_name value columns key,value columns.comments columns.types string:string #### A masked pattern was here #### - name default.table1 + name default.table2 numFiles 1 numRows 0 rawDataSize 0 - serialization.ddl struct table1 { string key, string value} + serialization.ddl struct table2 { string key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - totalSize 20 + totalSize 21 #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.table1 - name: default.table1 + name: default.table2 + name: default.table2 Truncated Path -> Alias: - /table1 [a] - Map 4 + /table2 [b] + + Stage: Stage-1 + Spark + Edges: + Reducer 2 <- Map 1 (GROUP, 1) +#### A masked pattern was here #### + Vertices: + Map 1 Map Operator Tree: TableScan - alias: b - Statistics: Num rows: 0 Data size: 21 Basic stats: PARTIAL Column stats: NONE + alias: a + Statistics: Num rows: 0 Data size: 20 Basic stats: PARTIAL Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: (key is not null and value is not null) (type: boolean) Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE - Reduce Output Operator - key expressions: key (type: string), value (type: string) - sort order: ++ - Map-reduce partition columns: key (type: string), value (type: string) + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + keys: + 0 key (type: string), value (type: string) + 1 key (type: string), value (type: string) + input vertices: + 1 Map 3 + Position of Big Table: 0 Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE - tag: 1 - auto parallelism: false + Group By Operator + aggregations: count() + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + tag: -1 + value expressions: _col0 (type: bigint) + auto parallelism: false + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: #### A masked pattern was here #### Partition - base file name: table2 + base file name: table1 input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: COLUMN_STATS_ACCURATE true SORTBUCKETCOLSPREFIX TRUE bucket_count 1 - bucket_field_name value + bucket_field_name key columns key,value columns.comments columns.types string:string #### A masked pattern was here #### - name default.table2 + name default.table1 numFiles 1 numRows 0 rawDataSize 0 - serialization.ddl struct table2 { string key, string value} + serialization.ddl struct table1 { string key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - totalSize 21 + totalSize 20 #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe @@ -236,47 +263,26 @@ STAGE PLANS: COLUMN_STATS_ACCURATE true SORTBUCKETCOLSPREFIX TRUE bucket_count 1 - bucket_field_name value + bucket_field_name key columns key,value columns.comments columns.types string:string #### A masked pattern was here #### - name default.table2 + name default.table1 numFiles 1 numRows 0 rawDataSize 0 - serialization.ddl struct table2 { string key, string value} + serialization.ddl struct table1 { string key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - totalSize 21 + totalSize 20 #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.table2 - name: default.table2 + name: default.table1 + name: default.table1 Truncated Path -> Alias: - /table2 [b] + /table1 [a] Reducer 2 - Needs Tagging: true - Reduce Operator Tree: - Join Operator - condition map: - Inner Join 0 to 1 - condition expressions: - 0 - 1 - Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE - Group By Operator - aggregations: count() - mode: hash - outputColumnNames: _col0 - Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - sort order: - Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE - tag: -1 - value expressions: _col0 (type: bigint) - auto parallelism: false - Reducer 3 Needs Tagging: false Reduce Operator Tree: Group By Operator diff --git ql/src/test/results/clientpositive/spark/bucket_map_join_2.q.out ql/src/test/results/clientpositive/spark/bucket_map_join_2.q.out index 33e9e8b..73ba0b4 100644 --- ql/src/test/results/clientpositive/spark/bucket_map_join_2.q.out +++ ql/src/test/results/clientpositive/spark/bucket_map_join_2.q.out @@ -104,59 +104,60 @@ TOK_QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 Stage-0 depends on stages: Stage-1 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-2 Spark - Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 4 (PARTITION-LEVEL SORT, 1) - Reducer 3 <- Reducer 2 (GROUP, 1) #### A masked pattern was here #### Vertices: - Map 1 + Map 3 Map Operator Tree: TableScan - alias: a - Statistics: Num rows: 0 Data size: 20 Basic stats: PARTIAL Column stats: NONE + alias: b + Statistics: Num rows: 0 Data size: 21 Basic stats: PARTIAL Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: (key is not null and value is not null) (type: boolean) Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE - Reduce Output Operator - key expressions: key (type: string), value (type: string) - sort order: ++ - Map-reduce partition columns: key (type: string), value (type: string) - Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE - tag: 0 - auto parallelism: false + Spark HashTable Sink Operator + condition expressions: + 0 + 1 + keys: + 0 key (type: string), value (type: string) + 1 key (type: string), value (type: string) + Position of Big Table: 0 + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: #### A masked pattern was here #### Partition - base file name: table1 + base file name: table2 input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: COLUMN_STATS_ACCURATE true SORTBUCKETCOLSPREFIX TRUE bucket_count 1 - bucket_field_name key + bucket_field_name value columns key,value columns.comments columns.types string:string #### A masked pattern was here #### - name default.table1 + name default.table2 numFiles 1 numRows 0 rawDataSize 0 - serialization.ddl struct table1 { string key, string value} + serialization.ddl struct table2 { string key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - totalSize 20 + totalSize 21 #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe @@ -166,67 +167,93 @@ STAGE PLANS: COLUMN_STATS_ACCURATE true SORTBUCKETCOLSPREFIX TRUE bucket_count 1 - bucket_field_name key + bucket_field_name value columns key,value columns.comments columns.types string:string #### A masked pattern was here #### - name default.table1 + name default.table2 numFiles 1 numRows 0 rawDataSize 0 - serialization.ddl struct table1 { string key, string value} + serialization.ddl struct table2 { string key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - totalSize 20 + totalSize 21 #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.table1 - name: default.table1 + name: default.table2 + name: default.table2 Truncated Path -> Alias: - /table1 [a] - Map 4 + /table2 [b] + + Stage: Stage-1 + Spark + Edges: + Reducer 2 <- Map 1 (GROUP, 1) +#### A masked pattern was here #### + Vertices: + Map 1 Map Operator Tree: TableScan - alias: b - Statistics: Num rows: 0 Data size: 21 Basic stats: PARTIAL Column stats: NONE + alias: a + Statistics: Num rows: 0 Data size: 20 Basic stats: PARTIAL Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: (key is not null and value is not null) (type: boolean) Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE - Reduce Output Operator - key expressions: key (type: string), value (type: string) - sort order: ++ - Map-reduce partition columns: key (type: string), value (type: string) + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + keys: + 0 key (type: string), value (type: string) + 1 key (type: string), value (type: string) + input vertices: + 1 Map 3 + Position of Big Table: 0 Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE - tag: 1 - auto parallelism: false + Group By Operator + aggregations: count() + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + tag: -1 + value expressions: _col0 (type: bigint) + auto parallelism: false + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: #### A masked pattern was here #### Partition - base file name: table2 + base file name: table1 input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: COLUMN_STATS_ACCURATE true SORTBUCKETCOLSPREFIX TRUE bucket_count 1 - bucket_field_name value + bucket_field_name key columns key,value columns.comments columns.types string:string #### A masked pattern was here #### - name default.table2 + name default.table1 numFiles 1 numRows 0 rawDataSize 0 - serialization.ddl struct table2 { string key, string value} + serialization.ddl struct table1 { string key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - totalSize 21 + totalSize 20 #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe @@ -236,47 +263,26 @@ STAGE PLANS: COLUMN_STATS_ACCURATE true SORTBUCKETCOLSPREFIX TRUE bucket_count 1 - bucket_field_name value + bucket_field_name key columns key,value columns.comments columns.types string:string #### A masked pattern was here #### - name default.table2 + name default.table1 numFiles 1 numRows 0 rawDataSize 0 - serialization.ddl struct table2 { string key, string value} + serialization.ddl struct table1 { string key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - totalSize 21 + totalSize 20 #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.table2 - name: default.table2 + name: default.table1 + name: default.table1 Truncated Path -> Alias: - /table2 [b] + /table1 [a] Reducer 2 - Needs Tagging: true - Reduce Operator Tree: - Join Operator - condition map: - Inner Join 0 to 1 - condition expressions: - 0 - 1 - Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE - Group By Operator - aggregations: count() - mode: hash - outputColumnNames: _col0 - Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - sort order: - Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE - tag: -1 - value expressions: _col0 (type: bigint) - auto parallelism: false - Reducer 3 Needs Tagging: false Reduce Operator Tree: Group By Operator diff --git ql/src/test/results/clientpositive/spark/bucketmapjoin1.q.out ql/src/test/results/clientpositive/spark/bucketmapjoin1.q.out index aaa0151..b18e02f 100644 --- ql/src/test/results/clientpositive/spark/bucketmapjoin1.q.out +++ ql/src/test/results/clientpositive/spark/bucketmapjoin1.q.out @@ -91,54 +91,26 @@ TOK_QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 Stage-0 depends on stages: Stage-1 STAGE PLANS: + Stage: Stage-2 + Spark +#### A masked pattern was here #### + Vertices: + Map 2 + Local Work: + Map Reduce Local Work + Stage: Stage-1 Spark - Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 3 (PARTITION-LEVEL SORT, 1) #### A masked pattern was here #### Vertices: Map 1 - Map 3 - Reducer 2 - Needs Tagging: true - Reduce Operator Tree: - Join Operator - condition map: - Inner Join 0 to 1 - condition expressions: - 0 {KEY.reducesinkkey0} {VALUE._col0} - 1 {VALUE._col0} - outputColumnNames: _col0, _col1, _col7 - Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE - Select Operator - expressions: _col0 (type: int), _col1 (type: string), _col7 (type: string) - outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE - File Output Operator - compressed: false - GlobalTableId: 0 -#### A masked pattern was here #### - NumFilesPerFileSink: 1 - Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE -#### A masked pattern was here #### - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - properties: - columns _col0,_col1,_col2 - columns.types int:string:string - escape.delim \ - hive.serialization.extend.nesting.levels true - serialization.format 1 - serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - TotalFiles: 1 - GatherStats: false - MultiFileSpray: false + Local Work: + Map Reduce Local Work Stage: Stage-0 Fetch Operator @@ -227,54 +199,26 @@ TOK_QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 Stage-0 depends on stages: Stage-1 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-2 Spark - Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 3 (PARTITION-LEVEL SORT, 3) #### A masked pattern was here #### Vertices: Map 1 - Map 3 - Reducer 2 - Needs Tagging: true - Reduce Operator Tree: - Join Operator - condition map: - Inner Join 0 to 1 - condition expressions: - 0 {KEY.reducesinkkey0} {VALUE._col0} - 1 {VALUE._col0} - outputColumnNames: _col0, _col1, _col7 - Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE - Select Operator - expressions: _col0 (type: int), _col1 (type: string), _col7 (type: string) - outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE - File Output Operator - compressed: false - GlobalTableId: 0 -#### A masked pattern was here #### - NumFilesPerFileSink: 1 - Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE -#### A masked pattern was here #### - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - properties: - columns _col0,_col1,_col2 - columns.types int:string:string - escape.delim \ - hive.serialization.extend.nesting.levels true - serialization.format 1 - serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - TotalFiles: 1 - GatherStats: false - MultiFileSpray: false + Local Work: + Map Reduce Local Work + + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: + Map 2 + Local Work: + Map Reduce Local Work Stage: Stage-0 Fetch Operator @@ -456,193 +400,196 @@ TOK_QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-3 is a root stage + Stage-1 depends on stages: Stage-3 Stage-0 depends on stages: Stage-1 Stage-2 depends on stages: Stage-0 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-3 Spark - Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 3 (PARTITION-LEVEL SORT, 3) #### A masked pattern was here #### Vertices: - Map 1 + Map 2 Map Operator Tree: TableScan - alias: a - Statistics: Num rows: 26 Data size: 2750 Basic stats: COMPLETE Column stats: NONE + alias: b + Statistics: Num rows: 55 Data size: 5812 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: key is not null (type: boolean) - Statistics: Num rows: 13 Data size: 1375 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: int) - sort order: + - Map-reduce partition columns: key (type: int) - Statistics: Num rows: 13 Data size: 1375 Basic stats: COMPLETE Column stats: NONE - tag: 0 - value expressions: value (type: string) - auto parallelism: false + Statistics: Num rows: 28 Data size: 2958 Basic stats: COMPLETE Column stats: NONE + Spark HashTable Sink Operator + condition expressions: + 0 {key} {value} + 1 {value} + keys: + 0 key (type: int) + 1 key (type: int) + Position of Big Table: 0 + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: #### A masked pattern was here #### Partition - base file name: srcbucket_mapjoin + base file name: ds=2008-04-08 input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 properties: COLUMN_STATS_ACCURATE true - bucket_count 2 + bucket_count 4 bucket_field_name key columns key,value columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin - numFiles 2 - serialization.ddl struct srcbucket_mapjoin { i32 key, string value} + name default.srcbucket_mapjoin_part + numFiles 4 + numRows 0 + partition_columns ds + partition_columns.types string + rawDataSize 0 + serialization.ddl struct srcbucket_mapjoin_part { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - totalSize 2750 + totalSize 5812 #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: - COLUMN_STATS_ACCURATE true - bucket_count 2 + bucket_count 4 bucket_field_name key columns key,value columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin - numFiles 2 - serialization.ddl struct srcbucket_mapjoin { i32 key, string value} + name default.srcbucket_mapjoin_part + partition_columns ds + partition_columns.types string + serialization.ddl struct srcbucket_mapjoin_part { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - totalSize 2750 #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.srcbucket_mapjoin - name: default.srcbucket_mapjoin + name: default.srcbucket_mapjoin_part + name: default.srcbucket_mapjoin_part Truncated Path -> Alias: - /srcbucket_mapjoin [a] - Map 3 + /srcbucket_mapjoin_part/ds=2008-04-08 [b] + + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: + Map 1 Map Operator Tree: TableScan - alias: b - Statistics: Num rows: 55 Data size: 5812 Basic stats: COMPLETE Column stats: NONE + alias: a + Statistics: Num rows: 26 Data size: 2750 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: key is not null (type: boolean) - Statistics: Num rows: 28 Data size: 2958 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: int) - sort order: + - Map-reduce partition columns: key (type: int) - Statistics: Num rows: 28 Data size: 2958 Basic stats: COMPLETE Column stats: NONE - tag: 1 - value expressions: value (type: string) - auto parallelism: false + Statistics: Num rows: 13 Data size: 1375 Basic stats: COMPLETE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {value} + 1 {value} + keys: + 0 key (type: int) + 1 key (type: int) + outputColumnNames: _col0, _col1, _col6 + input vertices: + 1 Map 2 + Position of Big Table: 0 + Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: int), _col1 (type: string), _col6 (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + GlobalTableId: 1 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns key,value1,value2 + columns.comments + columns.types string:string:string +#### A masked pattern was here #### + name default.bucketmapjoin_tmp_result + serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucketmapjoin_tmp_result + TotalFiles: 1 + GatherStats: true + MultiFileSpray: false + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: #### A masked pattern was here #### Partition - base file name: ds=2008-04-08 + base file name: srcbucket_mapjoin input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - partition values: - ds 2008-04-08 properties: COLUMN_STATS_ACCURATE true - bucket_count 4 + bucket_count 2 bucket_field_name key columns key,value columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part - numFiles 4 - numRows 0 - partition_columns ds - partition_columns.types string - rawDataSize 0 - serialization.ddl struct srcbucket_mapjoin_part { i32 key, string value} + name default.srcbucket_mapjoin + numFiles 2 + serialization.ddl struct srcbucket_mapjoin { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - totalSize 5812 + totalSize 2750 #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: - bucket_count 4 + COLUMN_STATS_ACCURATE true + bucket_count 2 bucket_field_name key columns key,value columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part - partition_columns ds - partition_columns.types string - serialization.ddl struct srcbucket_mapjoin_part { i32 key, string value} + name default.srcbucket_mapjoin + numFiles 2 + serialization.ddl struct srcbucket_mapjoin { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.srcbucket_mapjoin_part - name: default.srcbucket_mapjoin_part + name: default.srcbucket_mapjoin + name: default.srcbucket_mapjoin Truncated Path -> Alias: - /srcbucket_mapjoin_part/ds=2008-04-08 [b] - Reducer 2 - Needs Tagging: true - Reduce Operator Tree: - Join Operator - condition map: - Inner Join 0 to 1 - condition expressions: - 0 {KEY.reducesinkkey0} {VALUE._col0} - 1 {VALUE._col0} - outputColumnNames: _col0, _col1, _col6 - Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE - Select Operator - expressions: _col0 (type: int), _col1 (type: string), _col6 (type: string) - outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - GlobalTableId: 1 -#### A masked pattern was here #### - NumFilesPerFileSink: 1 - Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE -#### A masked pattern was here #### - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - properties: - bucket_count -1 - columns key,value1,value2 - columns.comments - columns.types string:string:string -#### A masked pattern was here #### - name default.bucketmapjoin_tmp_result - serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2} - serialization.format 1 - serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe -#### A masked pattern was here #### - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.bucketmapjoin_tmp_result - TotalFiles: 1 - GatherStats: true - MultiFileSpray: false + /srcbucket_mapjoin [a] Stage: Stage-0 Move Operator @@ -841,15 +788,14 @@ TOK_QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-3 is a root stage + Stage-1 depends on stages: Stage-3 Stage-0 depends on stages: Stage-1 Stage-2 depends on stages: Stage-0 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-3 Spark - Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 3 (PARTITION-LEVEL SORT, 3) #### A masked pattern was here #### Vertices: Map 1 @@ -862,14 +808,16 @@ STAGE PLANS: isSamplingPred: false predicate: key is not null (type: boolean) Statistics: Num rows: 13 Data size: 1375 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: int) - sort order: + - Map-reduce partition columns: key (type: int) - Statistics: Num rows: 13 Data size: 1375 Basic stats: COMPLETE Column stats: NONE - tag: 0 - value expressions: value (type: string) - auto parallelism: false + Spark HashTable Sink Operator + condition expressions: + 0 {value} + 1 {value} + keys: + 0 key (type: int) + 1 key (type: int) + Position of Big Table: 1 + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: @@ -917,7 +865,12 @@ STAGE PLANS: name: default.srcbucket_mapjoin Truncated Path -> Alias: /srcbucket_mapjoin [a] - Map 3 + + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: + Map 2 Map Operator Tree: TableScan alias: b @@ -927,14 +880,57 @@ STAGE PLANS: isSamplingPred: false predicate: key is not null (type: boolean) Statistics: Num rows: 28 Data size: 2958 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: int) - sort order: + - Map-reduce partition columns: key (type: int) - Statistics: Num rows: 28 Data size: 2958 Basic stats: COMPLETE Column stats: NONE - tag: 1 - value expressions: value (type: string) - auto parallelism: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {value} + 1 {value} + keys: + 0 key (type: int) + 1 key (type: int) + outputColumnNames: _col0, _col1, _col6 + input vertices: + 0 Map 1 + Position of Big Table: 1 + Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: int), _col1 (type: string), _col6 (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + GlobalTableId: 1 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + COLUMN_STATS_ACCURATE true + bucket_count -1 + columns key,value1,value2 + columns.comments + columns.types string:string:string +#### A masked pattern was here #### + name default.bucketmapjoin_tmp_result + numFiles 1 + numRows 464 + rawDataSize 8519 + serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 8983 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucketmapjoin_tmp_result + TotalFiles: 1 + GatherStats: true + MultiFileSpray: false + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: @@ -987,52 +983,6 @@ STAGE PLANS: name: default.srcbucket_mapjoin_part Truncated Path -> Alias: /srcbucket_mapjoin_part/ds=2008-04-08 [b] - Reducer 2 - Needs Tagging: true - Reduce Operator Tree: - Join Operator - condition map: - Inner Join 0 to 1 - condition expressions: - 0 {KEY.reducesinkkey0} {VALUE._col0} - 1 {VALUE._col0} - outputColumnNames: _col0, _col1, _col6 - Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE - Select Operator - expressions: _col0 (type: int), _col1 (type: string), _col6 (type: string) - outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - GlobalTableId: 1 -#### A masked pattern was here #### - NumFilesPerFileSink: 1 - Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE -#### A masked pattern was here #### - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - properties: - COLUMN_STATS_ACCURATE true - bucket_count -1 - columns key,value1,value2 - columns.comments - columns.types string:string:string -#### A masked pattern was here #### - name default.bucketmapjoin_tmp_result - numFiles 3 - numRows 464 - rawDataSize 8519 - serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2} - serialization.format 1 - serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - totalSize 8983 -#### A masked pattern was here #### - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.bucketmapjoin_tmp_result - TotalFiles: 1 - GatherStats: true - MultiFileSpray: false Stage: Stage-0 Move Operator @@ -1050,7 +1000,7 @@ STAGE PLANS: columns.types string:string:string #### A masked pattern was here #### name default.bucketmapjoin_tmp_result - numFiles 3 + numFiles 1 numRows 464 rawDataSize 8519 serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2} diff --git ql/src/test/results/clientpositive/spark/bucketmapjoin10.q.out ql/src/test/results/clientpositive/spark/bucketmapjoin10.q.out index 9954b77..eda8ca2 100644 --- ql/src/test/results/clientpositive/spark/bucketmapjoin10.q.out +++ ql/src/test/results/clientpositive/spark/bucketmapjoin10.q.out @@ -192,34 +192,35 @@ TOK_QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 Stage-0 depends on stages: Stage-1 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-2 Spark - Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 4 (PARTITION-LEVEL SORT, 1) - Reducer 3 <- Reducer 2 (GROUP, 1) #### A masked pattern was here #### Vertices: - Map 1 + Map 3 Map Operator Tree: TableScan - alias: a + alias: b Statistics: Num rows: 1737 Data size: 6950 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: key is not null (type: boolean) Statistics: Num rows: 869 Data size: 3477 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: int) - sort order: + - Map-reduce partition columns: key (type: int) - Statistics: Num rows: 869 Data size: 3477 Basic stats: COMPLETE Column stats: NONE - tag: 0 - auto parallelism: false + Spark HashTable Sink Operator + condition expressions: + 0 + 1 + keys: + 0 key (type: int) + 1 key (type: int) + Position of Big Table: 0 + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: @@ -232,22 +233,22 @@ STAGE PLANS: part 1 properties: COLUMN_STATS_ACCURATE true - bucket_count 2 + bucket_count 3 bucket_field_name key columns key,value columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_1 - numFiles 2 + name default.srcbucket_mapjoin_part_2 + numFiles 3 numRows 0 partition_columns part partition_columns.types string rawDataSize 0 - serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - totalSize 2750 + totalSize 4200 #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe @@ -260,16 +261,16 @@ STAGE PLANS: columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_1 + name default.srcbucket_mapjoin_part_2 partition_columns part partition_columns.types string - serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.srcbucket_mapjoin_part_1 - name: default.srcbucket_mapjoin_part_1 + name: default.srcbucket_mapjoin_part_2 + name: default.srcbucket_mapjoin_part_2 #### A masked pattern was here #### Partition base file name: part=2 @@ -279,22 +280,22 @@ STAGE PLANS: part 2 properties: COLUMN_STATS_ACCURATE true - bucket_count 3 + bucket_count 2 bucket_field_name key columns key,value columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_1 - numFiles 3 + name default.srcbucket_mapjoin_part_2 + numFiles 2 numRows 0 partition_columns part partition_columns.types string rawDataSize 0 - serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - totalSize 4200 + totalSize 2750 #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe @@ -307,36 +308,62 @@ STAGE PLANS: columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_1 + name default.srcbucket_mapjoin_part_2 partition_columns part partition_columns.types string - serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.srcbucket_mapjoin_part_1 - name: default.srcbucket_mapjoin_part_1 + name: default.srcbucket_mapjoin_part_2 + name: default.srcbucket_mapjoin_part_2 Truncated Path -> Alias: - /srcbucket_mapjoin_part_1/part=1 [a] - /srcbucket_mapjoin_part_1/part=2 [a] - Map 4 + /srcbucket_mapjoin_part_2/part=1 [b] + /srcbucket_mapjoin_part_2/part=2 [b] + + Stage: Stage-1 + Spark + Edges: + Reducer 2 <- Map 1 (GROUP, 1) +#### A masked pattern was here #### + Vertices: + Map 1 Map Operator Tree: TableScan - alias: b + alias: a Statistics: Num rows: 1737 Data size: 6950 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: key is not null (type: boolean) Statistics: Num rows: 869 Data size: 3477 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: int) - sort order: + - Map-reduce partition columns: key (type: int) - Statistics: Num rows: 869 Data size: 3477 Basic stats: COMPLETE Column stats: NONE - tag: 1 - auto parallelism: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + keys: + 0 key (type: int) + 1 key (type: int) + input vertices: + 1 Map 3 + Position of Big Table: 0 + Statistics: Num rows: 955 Data size: 3824 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count() + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + tag: -1 + value expressions: _col0 (type: bigint) + auto parallelism: false + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: @@ -349,22 +376,22 @@ STAGE PLANS: part 1 properties: COLUMN_STATS_ACCURATE true - bucket_count 3 + bucket_count 2 bucket_field_name key columns key,value columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_2 - numFiles 3 + name default.srcbucket_mapjoin_part_1 + numFiles 2 numRows 0 partition_columns part partition_columns.types string rawDataSize 0 - serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - totalSize 4200 + totalSize 2750 #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe @@ -377,16 +404,16 @@ STAGE PLANS: columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_2 + name default.srcbucket_mapjoin_part_1 partition_columns part partition_columns.types string - serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.srcbucket_mapjoin_part_2 - name: default.srcbucket_mapjoin_part_2 + name: default.srcbucket_mapjoin_part_1 + name: default.srcbucket_mapjoin_part_1 #### A masked pattern was here #### Partition base file name: part=2 @@ -396,22 +423,22 @@ STAGE PLANS: part 2 properties: COLUMN_STATS_ACCURATE true - bucket_count 2 + bucket_count 3 bucket_field_name key columns key,value columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_2 - numFiles 2 + name default.srcbucket_mapjoin_part_1 + numFiles 3 numRows 0 partition_columns part partition_columns.types string rawDataSize 0 - serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - totalSize 2750 + totalSize 4200 #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe @@ -424,41 +451,20 @@ STAGE PLANS: columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_2 + name default.srcbucket_mapjoin_part_1 partition_columns part partition_columns.types string - serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.srcbucket_mapjoin_part_2 - name: default.srcbucket_mapjoin_part_2 + name: default.srcbucket_mapjoin_part_1 + name: default.srcbucket_mapjoin_part_1 Truncated Path -> Alias: - /srcbucket_mapjoin_part_2/part=1 [b] - /srcbucket_mapjoin_part_2/part=2 [b] + /srcbucket_mapjoin_part_1/part=1 [a] + /srcbucket_mapjoin_part_1/part=2 [a] Reducer 2 - Needs Tagging: true - Reduce Operator Tree: - Join Operator - condition map: - Inner Join 0 to 1 - condition expressions: - 0 - 1 - Statistics: Num rows: 955 Data size: 3824 Basic stats: COMPLETE Column stats: NONE - Group By Operator - aggregations: count() - mode: hash - outputColumnNames: _col0 - Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - sort order: - Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE - tag: -1 - value expressions: _col0 (type: bigint) - auto parallelism: false - Reducer 3 Needs Tagging: false Reduce Operator Tree: Group By Operator diff --git ql/src/test/results/clientpositive/spark/bucketmapjoin11.q.out ql/src/test/results/clientpositive/spark/bucketmapjoin11.q.out index ad8f0a5..bb7214c 100644 --- ql/src/test/results/clientpositive/spark/bucketmapjoin11.q.out +++ ql/src/test/results/clientpositive/spark/bucketmapjoin11.q.out @@ -202,34 +202,35 @@ TOK_QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 Stage-0 depends on stages: Stage-1 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-2 Spark - Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 4 (PARTITION-LEVEL SORT, 1) - Reducer 3 <- Reducer 2 (GROUP, 1) #### A masked pattern was here #### Vertices: - Map 1 + Map 3 Map Operator Tree: TableScan - alias: a + alias: b Statistics: Num rows: 2140 Data size: 8562 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: key is not null (type: boolean) Statistics: Num rows: 1070 Data size: 4281 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: int) - sort order: + - Map-reduce partition columns: key (type: int) - Statistics: Num rows: 1070 Data size: 4281 Basic stats: COMPLETE Column stats: NONE - tag: 0 - auto parallelism: false + Spark HashTable Sink Operator + condition expressions: + 0 + 1 + keys: + 0 key (type: int) + 1 key (type: int) + Position of Big Table: 0 + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: @@ -242,44 +243,44 @@ STAGE PLANS: part 1 properties: COLUMN_STATS_ACCURATE true - bucket_count 2 + bucket_count 4 bucket_field_name key columns key,value columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_1 - numFiles 2 + name default.srcbucket_mapjoin_part_2 + numFiles 4 numRows 0 partition_columns part partition_columns.types string rawDataSize 0 - serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - totalSize 2750 + totalSize 5812 #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: - bucket_count 4 + bucket_count 2 bucket_field_name key columns key,value columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_1 + name default.srcbucket_mapjoin_part_2 partition_columns part partition_columns.types string - serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.srcbucket_mapjoin_part_1 - name: default.srcbucket_mapjoin_part_1 + name: default.srcbucket_mapjoin_part_2 + name: default.srcbucket_mapjoin_part_2 #### A masked pattern was here #### Partition base file name: part=2 @@ -289,64 +290,90 @@ STAGE PLANS: part 2 properties: COLUMN_STATS_ACCURATE true - bucket_count 4 + bucket_count 2 bucket_field_name key columns key,value columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_1 - numFiles 4 + name default.srcbucket_mapjoin_part_2 + numFiles 2 numRows 0 partition_columns part partition_columns.types string rawDataSize 0 - serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - totalSize 5812 + totalSize 2750 #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: - bucket_count 4 + bucket_count 2 bucket_field_name key columns key,value columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_1 + name default.srcbucket_mapjoin_part_2 partition_columns part partition_columns.types string - serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.srcbucket_mapjoin_part_1 - name: default.srcbucket_mapjoin_part_1 + name: default.srcbucket_mapjoin_part_2 + name: default.srcbucket_mapjoin_part_2 Truncated Path -> Alias: - /srcbucket_mapjoin_part_1/part=1 [a] - /srcbucket_mapjoin_part_1/part=2 [a] - Map 4 + /srcbucket_mapjoin_part_2/part=1 [b] + /srcbucket_mapjoin_part_2/part=2 [b] + + Stage: Stage-1 + Spark + Edges: + Reducer 2 <- Map 1 (GROUP, 1) +#### A masked pattern was here #### + Vertices: + Map 1 Map Operator Tree: TableScan - alias: b + alias: a Statistics: Num rows: 2140 Data size: 8562 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: key is not null (type: boolean) Statistics: Num rows: 1070 Data size: 4281 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: int) - sort order: + - Map-reduce partition columns: key (type: int) - Statistics: Num rows: 1070 Data size: 4281 Basic stats: COMPLETE Column stats: NONE - tag: 1 - auto parallelism: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + keys: + 0 key (type: int) + 1 key (type: int) + input vertices: + 1 Map 3 + Position of Big Table: 0 + Statistics: Num rows: 1177 Data size: 4709 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count() + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + tag: -1 + value expressions: _col0 (type: bigint) + auto parallelism: false + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: @@ -359,44 +386,44 @@ STAGE PLANS: part 1 properties: COLUMN_STATS_ACCURATE true - bucket_count 4 + bucket_count 2 bucket_field_name key columns key,value columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_2 - numFiles 4 + name default.srcbucket_mapjoin_part_1 + numFiles 2 numRows 0 partition_columns part partition_columns.types string rawDataSize 0 - serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - totalSize 5812 + totalSize 2750 #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: - bucket_count 2 + bucket_count 4 bucket_field_name key columns key,value columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_2 + name default.srcbucket_mapjoin_part_1 partition_columns part partition_columns.types string - serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.srcbucket_mapjoin_part_2 - name: default.srcbucket_mapjoin_part_2 + name: default.srcbucket_mapjoin_part_1 + name: default.srcbucket_mapjoin_part_1 #### A masked pattern was here #### Partition base file name: part=2 @@ -406,69 +433,48 @@ STAGE PLANS: part 2 properties: COLUMN_STATS_ACCURATE true - bucket_count 2 + bucket_count 4 bucket_field_name key columns key,value columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_2 - numFiles 2 + name default.srcbucket_mapjoin_part_1 + numFiles 4 numRows 0 partition_columns part partition_columns.types string rawDataSize 0 - serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - totalSize 2750 + totalSize 5812 #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: - bucket_count 2 + bucket_count 4 bucket_field_name key columns key,value columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_2 + name default.srcbucket_mapjoin_part_1 partition_columns part partition_columns.types string - serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.srcbucket_mapjoin_part_2 - name: default.srcbucket_mapjoin_part_2 + name: default.srcbucket_mapjoin_part_1 + name: default.srcbucket_mapjoin_part_1 Truncated Path -> Alias: - /srcbucket_mapjoin_part_2/part=1 [b] - /srcbucket_mapjoin_part_2/part=2 [b] + /srcbucket_mapjoin_part_1/part=1 [a] + /srcbucket_mapjoin_part_1/part=2 [a] Reducer 2 - Needs Tagging: true - Reduce Operator Tree: - Join Operator - condition map: - Inner Join 0 to 1 - condition expressions: - 0 - 1 - Statistics: Num rows: 1177 Data size: 4709 Basic stats: COMPLETE Column stats: NONE - Group By Operator - aggregations: count() - mode: hash - outputColumnNames: _col0 - Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - sort order: - Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE - tag: -1 - value expressions: _col0 (type: bigint) - auto parallelism: false - Reducer 3 Needs Tagging: false Reduce Operator Tree: Group By Operator @@ -603,34 +609,35 @@ TOK_QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 Stage-0 depends on stages: Stage-1 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-2 Spark - Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 4 (PARTITION-LEVEL SORT, 3) - Reducer 3 <- Reducer 2 (GROUP, 1) #### A masked pattern was here #### Vertices: - Map 1 + Map 3 Map Operator Tree: TableScan - alias: a + alias: b Statistics: Num rows: 2140 Data size: 8562 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: key is not null (type: boolean) Statistics: Num rows: 1070 Data size: 4281 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: int), part (type: string) - sort order: ++ - Map-reduce partition columns: key (type: int), part (type: string) - Statistics: Num rows: 1070 Data size: 4281 Basic stats: COMPLETE Column stats: NONE - tag: 0 - auto parallelism: false + Spark HashTable Sink Operator + condition expressions: + 0 + 1 + keys: + 0 key (type: int), part (type: string) + 1 key (type: int), part (type: string) + Position of Big Table: 0 + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: @@ -643,44 +650,44 @@ STAGE PLANS: part 1 properties: COLUMN_STATS_ACCURATE true - bucket_count 2 + bucket_count 4 bucket_field_name key columns key,value columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_1 - numFiles 2 + name default.srcbucket_mapjoin_part_2 + numFiles 4 numRows 0 partition_columns part partition_columns.types string rawDataSize 0 - serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - totalSize 2750 + totalSize 5812 #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: - bucket_count 4 + bucket_count 2 bucket_field_name key columns key,value columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_1 + name default.srcbucket_mapjoin_part_2 partition_columns part partition_columns.types string - serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.srcbucket_mapjoin_part_1 - name: default.srcbucket_mapjoin_part_1 + name: default.srcbucket_mapjoin_part_2 + name: default.srcbucket_mapjoin_part_2 #### A masked pattern was here #### Partition base file name: part=2 @@ -690,64 +697,90 @@ STAGE PLANS: part 2 properties: COLUMN_STATS_ACCURATE true - bucket_count 4 + bucket_count 2 bucket_field_name key columns key,value columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_1 - numFiles 4 + name default.srcbucket_mapjoin_part_2 + numFiles 2 numRows 0 partition_columns part partition_columns.types string rawDataSize 0 - serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - totalSize 5812 + totalSize 2750 #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: - bucket_count 4 + bucket_count 2 bucket_field_name key columns key,value columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_1 + name default.srcbucket_mapjoin_part_2 partition_columns part partition_columns.types string - serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.srcbucket_mapjoin_part_1 - name: default.srcbucket_mapjoin_part_1 + name: default.srcbucket_mapjoin_part_2 + name: default.srcbucket_mapjoin_part_2 Truncated Path -> Alias: - /srcbucket_mapjoin_part_1/part=1 [a] - /srcbucket_mapjoin_part_1/part=2 [a] - Map 4 + /srcbucket_mapjoin_part_2/part=1 [b] + /srcbucket_mapjoin_part_2/part=2 [b] + + Stage: Stage-1 + Spark + Edges: + Reducer 2 <- Map 1 (GROUP, 1) +#### A masked pattern was here #### + Vertices: + Map 1 Map Operator Tree: TableScan - alias: b + alias: a Statistics: Num rows: 2140 Data size: 8562 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: key is not null (type: boolean) Statistics: Num rows: 1070 Data size: 4281 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: int), part (type: string) - sort order: ++ - Map-reduce partition columns: key (type: int), part (type: string) - Statistics: Num rows: 1070 Data size: 4281 Basic stats: COMPLETE Column stats: NONE - tag: 1 - auto parallelism: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + keys: + 0 key (type: int), part (type: string) + 1 key (type: int), part (type: string) + input vertices: + 1 Map 3 + Position of Big Table: 0 + Statistics: Num rows: 1177 Data size: 4709 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count() + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + tag: -1 + value expressions: _col0 (type: bigint) + auto parallelism: false + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: @@ -760,44 +793,44 @@ STAGE PLANS: part 1 properties: COLUMN_STATS_ACCURATE true - bucket_count 4 + bucket_count 2 bucket_field_name key columns key,value columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_2 - numFiles 4 + name default.srcbucket_mapjoin_part_1 + numFiles 2 numRows 0 partition_columns part partition_columns.types string rawDataSize 0 - serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - totalSize 5812 + totalSize 2750 #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: - bucket_count 2 + bucket_count 4 bucket_field_name key columns key,value columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_2 + name default.srcbucket_mapjoin_part_1 partition_columns part partition_columns.types string - serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.srcbucket_mapjoin_part_2 - name: default.srcbucket_mapjoin_part_2 + name: default.srcbucket_mapjoin_part_1 + name: default.srcbucket_mapjoin_part_1 #### A masked pattern was here #### Partition base file name: part=2 @@ -807,69 +840,48 @@ STAGE PLANS: part 2 properties: COLUMN_STATS_ACCURATE true - bucket_count 2 + bucket_count 4 bucket_field_name key columns key,value columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_2 - numFiles 2 + name default.srcbucket_mapjoin_part_1 + numFiles 4 numRows 0 partition_columns part partition_columns.types string rawDataSize 0 - serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - totalSize 2750 + totalSize 5812 #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: - bucket_count 2 + bucket_count 4 bucket_field_name key columns key,value columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_2 + name default.srcbucket_mapjoin_part_1 partition_columns part partition_columns.types string - serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.srcbucket_mapjoin_part_2 - name: default.srcbucket_mapjoin_part_2 + name: default.srcbucket_mapjoin_part_1 + name: default.srcbucket_mapjoin_part_1 Truncated Path -> Alias: - /srcbucket_mapjoin_part_2/part=1 [b] - /srcbucket_mapjoin_part_2/part=2 [b] + /srcbucket_mapjoin_part_1/part=1 [a] + /srcbucket_mapjoin_part_1/part=2 [a] Reducer 2 - Needs Tagging: true - Reduce Operator Tree: - Join Operator - condition map: - Inner Join 0 to 1 - condition expressions: - 0 - 1 - Statistics: Num rows: 1177 Data size: 4709 Basic stats: COMPLETE Column stats: NONE - Group By Operator - aggregations: count() - mode: hash - outputColumnNames: _col0 - Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - sort order: - Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE - tag: -1 - value expressions: _col0 (type: bigint) - auto parallelism: false - Reducer 3 Needs Tagging: false Reduce Operator Tree: Group By Operator diff --git ql/src/test/results/clientpositive/spark/bucketmapjoin12.q.out ql/src/test/results/clientpositive/spark/bucketmapjoin12.q.out index aa3e2b6..c0adef4 100644 --- ql/src/test/results/clientpositive/spark/bucketmapjoin12.q.out +++ ql/src/test/results/clientpositive/spark/bucketmapjoin12.q.out @@ -161,34 +161,35 @@ TOK_QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 Stage-0 depends on stages: Stage-1 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-2 Spark - Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 4 (PARTITION-LEVEL SORT, 1) - Reducer 3 <- Reducer 2 (GROUP, 1) #### A masked pattern was here #### Vertices: - Map 1 + Map 3 Map Operator Tree: TableScan - alias: a + alias: b Statistics: Num rows: 687 Data size: 2750 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: key is not null (type: boolean) Statistics: Num rows: 344 Data size: 1377 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: int) - sort order: + - Map-reduce partition columns: key (type: int) - Statistics: Num rows: 344 Data size: 1377 Basic stats: COMPLETE Column stats: NONE - tag: 0 - auto parallelism: false + Spark HashTable Sink Operator + condition expressions: + 0 + 1 + keys: + 0 key (type: int) + 1 key (type: int) + Position of Big Table: 0 + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: @@ -207,13 +208,13 @@ STAGE PLANS: columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_1 + name default.srcbucket_mapjoin_part_2 numFiles 2 numRows 0 partition_columns part partition_columns.types string rawDataSize 0 - serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 2750 @@ -223,41 +224,66 @@ STAGE PLANS: input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: - bucket_count 2 - bucket_field_name key + bucket_count -1 columns key,value columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_1 + name default.srcbucket_mapjoin_part_2 partition_columns part partition_columns.types string - serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.srcbucket_mapjoin_part_1 - name: default.srcbucket_mapjoin_part_1 + name: default.srcbucket_mapjoin_part_2 + name: default.srcbucket_mapjoin_part_2 Truncated Path -> Alias: - /srcbucket_mapjoin_part_1/part=1 [a] - Map 4 + /srcbucket_mapjoin_part_2/part=1 [b] + + Stage: Stage-1 + Spark + Edges: + Reducer 2 <- Map 1 (GROUP, 1) +#### A masked pattern was here #### + Vertices: + Map 1 Map Operator Tree: TableScan - alias: b + alias: a Statistics: Num rows: 687 Data size: 2750 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: key is not null (type: boolean) Statistics: Num rows: 344 Data size: 1377 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: int) - sort order: + - Map-reduce partition columns: key (type: int) - Statistics: Num rows: 344 Data size: 1377 Basic stats: COMPLETE Column stats: NONE - tag: 1 - auto parallelism: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + keys: + 0 key (type: int) + 1 key (type: int) + input vertices: + 1 Map 3 + Position of Big Table: 0 + Statistics: Num rows: 378 Data size: 1514 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count() + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + tag: -1 + value expressions: _col0 (type: bigint) + auto parallelism: false + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: @@ -276,13 +302,13 @@ STAGE PLANS: columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_2 + name default.srcbucket_mapjoin_part_1 numFiles 2 numRows 0 partition_columns part partition_columns.types string rawDataSize 0 - serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 2750 @@ -292,45 +318,25 @@ STAGE PLANS: input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: - bucket_count -1 + bucket_count 2 + bucket_field_name key columns key,value columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_2 + name default.srcbucket_mapjoin_part_1 partition_columns part partition_columns.types string - serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.srcbucket_mapjoin_part_2 - name: default.srcbucket_mapjoin_part_2 + name: default.srcbucket_mapjoin_part_1 + name: default.srcbucket_mapjoin_part_1 Truncated Path -> Alias: - /srcbucket_mapjoin_part_2/part=1 [b] + /srcbucket_mapjoin_part_1/part=1 [a] Reducer 2 - Needs Tagging: true - Reduce Operator Tree: - Join Operator - condition map: - Inner Join 0 to 1 - condition expressions: - 0 - 1 - Statistics: Num rows: 378 Data size: 1514 Basic stats: COMPLETE Column stats: NONE - Group By Operator - aggregations: count() - mode: hash - outputColumnNames: _col0 - Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - sort order: - Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE - tag: -1 - value expressions: _col0 (type: bigint) - auto parallelism: false - Reducer 3 Needs Tagging: false Reduce Operator Tree: Group By Operator @@ -455,34 +461,35 @@ TOK_QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 Stage-0 depends on stages: Stage-1 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-2 Spark - Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 4 (PARTITION-LEVEL SORT, 3) - Reducer 3 <- Reducer 2 (GROUP, 1) #### A masked pattern was here #### Vertices: - Map 1 + Map 3 Map Operator Tree: TableScan - alias: a + alias: b Statistics: Num rows: 687 Data size: 2750 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: key is not null (type: boolean) Statistics: Num rows: 344 Data size: 1377 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: int) - sort order: + - Map-reduce partition columns: key (type: int) - Statistics: Num rows: 344 Data size: 1377 Basic stats: COMPLETE Column stats: NONE - tag: 0 - auto parallelism: false + Spark HashTable Sink Operator + condition expressions: + 0 + 1 + keys: + 0 key (type: int) + 1 key (type: int) + Position of Big Table: 0 + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: @@ -495,19 +502,18 @@ STAGE PLANS: part 1 properties: COLUMN_STATS_ACCURATE true - bucket_count 2 - bucket_field_name key + bucket_count -1 columns key,value columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_1 + name default.srcbucket_mapjoin_part_3 numFiles 2 numRows 0 partition_columns part partition_columns.types string rawDataSize 0 - serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_3 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 2750 @@ -523,35 +529,61 @@ STAGE PLANS: columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_1 + name default.srcbucket_mapjoin_part_3 partition_columns part partition_columns.types string - serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_3 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.srcbucket_mapjoin_part_1 - name: default.srcbucket_mapjoin_part_1 + name: default.srcbucket_mapjoin_part_3 + name: default.srcbucket_mapjoin_part_3 Truncated Path -> Alias: - /srcbucket_mapjoin_part_1/part=1 [a] - Map 4 + /srcbucket_mapjoin_part_3/part=1 [b] + + Stage: Stage-1 + Spark + Edges: + Reducer 2 <- Map 1 (GROUP, 1) +#### A masked pattern was here #### + Vertices: + Map 1 Map Operator Tree: TableScan - alias: b + alias: a Statistics: Num rows: 687 Data size: 2750 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: key is not null (type: boolean) Statistics: Num rows: 344 Data size: 1377 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: int) - sort order: + - Map-reduce partition columns: key (type: int) - Statistics: Num rows: 344 Data size: 1377 Basic stats: COMPLETE Column stats: NONE - tag: 1 - auto parallelism: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + keys: + 0 key (type: int) + 1 key (type: int) + input vertices: + 1 Map 3 + Position of Big Table: 0 + Statistics: Num rows: 378 Data size: 1514 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count() + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + tag: -1 + value expressions: _col0 (type: bigint) + auto parallelism: false + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: @@ -564,18 +596,19 @@ STAGE PLANS: part 1 properties: COLUMN_STATS_ACCURATE true - bucket_count -1 + bucket_count 2 + bucket_field_name key columns key,value columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_3 + name default.srcbucket_mapjoin_part_1 numFiles 2 numRows 0 partition_columns part partition_columns.types string rawDataSize 0 - serialization.ddl struct srcbucket_mapjoin_part_3 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 2750 @@ -591,40 +624,19 @@ STAGE PLANS: columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_3 + name default.srcbucket_mapjoin_part_1 partition_columns part partition_columns.types string - serialization.ddl struct srcbucket_mapjoin_part_3 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.srcbucket_mapjoin_part_3 - name: default.srcbucket_mapjoin_part_3 + name: default.srcbucket_mapjoin_part_1 + name: default.srcbucket_mapjoin_part_1 Truncated Path -> Alias: - /srcbucket_mapjoin_part_3/part=1 [b] + /srcbucket_mapjoin_part_1/part=1 [a] Reducer 2 - Needs Tagging: true - Reduce Operator Tree: - Join Operator - condition map: - Inner Join 0 to 1 - condition expressions: - 0 - 1 - Statistics: Num rows: 378 Data size: 1514 Basic stats: COMPLETE Column stats: NONE - Group By Operator - aggregations: count() - mode: hash - outputColumnNames: _col0 - Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - sort order: - Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE - tag: -1 - value expressions: _col0 (type: bigint) - auto parallelism: false - Reducer 3 Needs Tagging: false Reduce Operator Tree: Group By Operator diff --git ql/src/test/results/clientpositive/spark/bucketmapjoin13.q.out ql/src/test/results/clientpositive/spark/bucketmapjoin13.q.out index 44233f6..98d0706 100644 --- ql/src/test/results/clientpositive/spark/bucketmapjoin13.q.out +++ ql/src/test/results/clientpositive/spark/bucketmapjoin13.q.out @@ -120,34 +120,35 @@ TOK_QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 Stage-0 depends on stages: Stage-1 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-2 Spark - Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 4 (PARTITION-LEVEL SORT, 3) - Reducer 3 <- Reducer 2 (GROUP, 1) #### A masked pattern was here #### Vertices: - Map 1 + Map 3 Map Operator Tree: TableScan - alias: a - Statistics: Num rows: 1000 Data size: 10624 Basic stats: COMPLETE Column stats: NONE + alias: b + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: key is not null (type: boolean) - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: int) - sort order: + - Map-reduce partition columns: key (type: int) - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - tag: 0 - auto parallelism: false + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + Spark HashTable Sink Operator + condition expressions: + 0 + 1 + keys: + 0 key (type: int) + 1 key (type: int) + Position of Big Table: 0 + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: @@ -161,18 +162,18 @@ STAGE PLANS: properties: COLUMN_STATS_ACCURATE true bucket_count 2 - bucket_field_name value + bucket_field_name key columns key,value columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_1 + name default.srcbucket_mapjoin_part_2 numFiles 2 numRows 500 partition_columns part partition_columns.types string rawDataSize 5312 - serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 5812 @@ -188,27 +189,75 @@ STAGE PLANS: columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_1 + name default.srcbucket_mapjoin_part_2 partition_columns part partition_columns.types string - serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.srcbucket_mapjoin_part_1 - name: default.srcbucket_mapjoin_part_1 + name: default.srcbucket_mapjoin_part_2 + name: default.srcbucket_mapjoin_part_2 + Truncated Path -> Alias: + /srcbucket_mapjoin_part_2/part=1 [b] + + Stage: Stage-1 + Spark + Edges: + Reducer 2 <- Map 1 (GROUP, 1) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: a + Statistics: Num rows: 1000 Data size: 10624 Basic stats: COMPLETE Column stats: NONE + GatherStats: false + Filter Operator + isSamplingPred: false + predicate: key is not null (type: boolean) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + keys: + 0 key (type: int) + 1 key (type: int) + input vertices: + 1 Map 3 + Position of Big Table: 0 + Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count() + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + tag: -1 + value expressions: _col0 (type: bigint) + auto parallelism: false + Local Work: + Map Reduce Local Work + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: #### A masked pattern was here #### Partition - base file name: part=2 + base file name: part=1 input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat partition values: - part 2 + part 1 properties: COLUMN_STATS_ACCURATE true bucket_count 2 - bucket_field_name key + bucket_field_name value columns key,value columns.comments columns.types int:string @@ -245,36 +294,13 @@ STAGE PLANS: serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.srcbucket_mapjoin_part_1 name: default.srcbucket_mapjoin_part_1 - Truncated Path -> Alias: - /srcbucket_mapjoin_part_1/part=1 [a] - /srcbucket_mapjoin_part_1/part=2 [a] - Map 4 - Map Operator Tree: - TableScan - alias: b - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - GatherStats: false - Filter Operator - isSamplingPred: false - predicate: key is not null (type: boolean) - Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: int) - sort order: + - Map-reduce partition columns: key (type: int) - Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - tag: 1 - auto parallelism: false - Path -> Alias: -#### A masked pattern was here #### - Path -> Partition: #### A masked pattern was here #### Partition - base file name: part=1 + base file name: part=2 input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat partition values: - part 1 + part 2 properties: COLUMN_STATS_ACCURATE true bucket_count 2 @@ -283,13 +309,13 @@ STAGE PLANS: columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_2 + name default.srcbucket_mapjoin_part_1 numFiles 2 numRows 500 partition_columns part partition_columns.types string rawDataSize 5312 - serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 5812 @@ -305,40 +331,20 @@ STAGE PLANS: columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_2 + name default.srcbucket_mapjoin_part_1 partition_columns part partition_columns.types string - serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.srcbucket_mapjoin_part_2 - name: default.srcbucket_mapjoin_part_2 + name: default.srcbucket_mapjoin_part_1 + name: default.srcbucket_mapjoin_part_1 Truncated Path -> Alias: - /srcbucket_mapjoin_part_2/part=1 [b] + /srcbucket_mapjoin_part_1/part=1 [a] + /srcbucket_mapjoin_part_1/part=2 [a] Reducer 2 - Needs Tagging: true - Reduce Operator Tree: - Join Operator - condition map: - Inner Join 0 to 1 - condition expressions: - 0 - 1 - Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE - Group By Operator - aggregations: count() - mode: hash - outputColumnNames: _col0 - Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - sort order: - Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE - tag: -1 - value expressions: _col0 (type: bigint) - auto parallelism: false - Reducer 3 Needs Tagging: false Reduce Operator Tree: Group By Operator @@ -458,44 +464,45 @@ TOK_QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 Stage-0 depends on stages: Stage-1 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-2 Spark - Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 4 (PARTITION-LEVEL SORT, 3) - Reducer 3 <- Reducer 2 (GROUP, 1) #### A masked pattern was here #### Vertices: - Map 1 + Map 3 Map Operator Tree: TableScan - alias: a + alias: b Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: key is not null (type: boolean) Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: int) - sort order: + - Map-reduce partition columns: key (type: int) - Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - tag: 0 - auto parallelism: false + Spark HashTable Sink Operator + condition expressions: + 0 + 1 + keys: + 0 key (type: int) + 1 key (type: int) + Position of Big Table: 0 + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: #### A masked pattern was here #### Partition - base file name: part=2 + base file name: part=1 input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat partition values: - part 2 + part 1 properties: COLUMN_STATS_ACCURATE true bucket_count 2 @@ -504,13 +511,13 @@ STAGE PLANS: columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_1 + name default.srcbucket_mapjoin_part_2 numFiles 2 numRows 500 partition_columns part partition_columns.types string rawDataSize 5312 - serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 5812 @@ -526,45 +533,71 @@ STAGE PLANS: columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_1 + name default.srcbucket_mapjoin_part_2 partition_columns part partition_columns.types string - serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.srcbucket_mapjoin_part_1 - name: default.srcbucket_mapjoin_part_1 + name: default.srcbucket_mapjoin_part_2 + name: default.srcbucket_mapjoin_part_2 Truncated Path -> Alias: - /srcbucket_mapjoin_part_1/part=2 [a] - Map 4 + /srcbucket_mapjoin_part_2/part=1 [b] + + Stage: Stage-1 + Spark + Edges: + Reducer 2 <- Map 1 (GROUP, 1) +#### A masked pattern was here #### + Vertices: + Map 1 Map Operator Tree: TableScan - alias: b + alias: a Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: key is not null (type: boolean) Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: int) - sort order: + - Map-reduce partition columns: key (type: int) - Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - tag: 1 - auto parallelism: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + keys: + 0 key (type: int) + 1 key (type: int) + input vertices: + 1 Map 3 + Position of Big Table: 0 + Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count() + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + tag: -1 + value expressions: _col0 (type: bigint) + auto parallelism: false + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: #### A masked pattern was here #### Partition - base file name: part=1 + base file name: part=2 input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat partition values: - part 1 + part 2 properties: COLUMN_STATS_ACCURATE true bucket_count 2 @@ -573,13 +606,13 @@ STAGE PLANS: columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_2 + name default.srcbucket_mapjoin_part_1 numFiles 2 numRows 500 partition_columns part partition_columns.types string rawDataSize 5312 - serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 5812 @@ -595,40 +628,19 @@ STAGE PLANS: columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_2 + name default.srcbucket_mapjoin_part_1 partition_columns part partition_columns.types string - serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.srcbucket_mapjoin_part_2 - name: default.srcbucket_mapjoin_part_2 + name: default.srcbucket_mapjoin_part_1 + name: default.srcbucket_mapjoin_part_1 Truncated Path -> Alias: - /srcbucket_mapjoin_part_2/part=1 [b] + /srcbucket_mapjoin_part_1/part=2 [a] Reducer 2 - Needs Tagging: true - Reduce Operator Tree: - Join Operator - condition map: - Inner Join 0 to 1 - condition expressions: - 0 - 1 - Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE - Group By Operator - aggregations: count() - mode: hash - outputColumnNames: _col0 - Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - sort order: - Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE - tag: -1 - value expressions: _col0 (type: bigint) - auto parallelism: false - Reducer 3 Needs Tagging: false Reduce Operator Tree: Group By Operator @@ -747,44 +759,45 @@ TOK_QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 Stage-0 depends on stages: Stage-1 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-2 Spark - Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 4 (PARTITION-LEVEL SORT, 3) - Reducer 3 <- Reducer 2 (GROUP, 1) #### A masked pattern was here #### Vertices: - Map 1 + Map 3 Map Operator Tree: TableScan - alias: a + alias: b Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: key is not null (type: boolean) Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: int) - sort order: + - Map-reduce partition columns: key (type: int) - Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - tag: 0 - auto parallelism: false + Spark HashTable Sink Operator + condition expressions: + 0 + 1 + keys: + 0 key (type: int) + 1 key (type: int) + Position of Big Table: 0 + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: #### A masked pattern was here #### Partition - base file name: part=2 + base file name: part=1 input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat partition values: - part 2 + part 1 properties: COLUMN_STATS_ACCURATE true bucket_count 2 @@ -793,13 +806,13 @@ STAGE PLANS: columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_1 + name default.srcbucket_mapjoin_part_2 numFiles 2 numRows 500 partition_columns part partition_columns.types string rawDataSize 5312 - serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 5812 @@ -815,45 +828,71 @@ STAGE PLANS: columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_1 + name default.srcbucket_mapjoin_part_2 partition_columns part partition_columns.types string - serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.srcbucket_mapjoin_part_1 - name: default.srcbucket_mapjoin_part_1 + name: default.srcbucket_mapjoin_part_2 + name: default.srcbucket_mapjoin_part_2 Truncated Path -> Alias: - /srcbucket_mapjoin_part_1/part=2 [a] - Map 4 + /srcbucket_mapjoin_part_2/part=1 [b] + + Stage: Stage-1 + Spark + Edges: + Reducer 2 <- Map 1 (GROUP, 1) +#### A masked pattern was here #### + Vertices: + Map 1 Map Operator Tree: TableScan - alias: b + alias: a Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: key is not null (type: boolean) Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: int) - sort order: + - Map-reduce partition columns: key (type: int) - Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - tag: 1 - auto parallelism: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + keys: + 0 key (type: int) + 1 key (type: int) + input vertices: + 1 Map 3 + Position of Big Table: 0 + Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count() + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + tag: -1 + value expressions: _col0 (type: bigint) + auto parallelism: false + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: #### A masked pattern was here #### Partition - base file name: part=1 + base file name: part=2 input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat partition values: - part 1 + part 2 properties: COLUMN_STATS_ACCURATE true bucket_count 2 @@ -862,13 +901,13 @@ STAGE PLANS: columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_2 + name default.srcbucket_mapjoin_part_1 numFiles 2 numRows 500 partition_columns part partition_columns.types string rawDataSize 5312 - serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 5812 @@ -884,40 +923,19 @@ STAGE PLANS: columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_2 + name default.srcbucket_mapjoin_part_1 partition_columns part partition_columns.types string - serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.srcbucket_mapjoin_part_2 - name: default.srcbucket_mapjoin_part_2 + name: default.srcbucket_mapjoin_part_1 + name: default.srcbucket_mapjoin_part_1 Truncated Path -> Alias: - /srcbucket_mapjoin_part_2/part=1 [b] + /srcbucket_mapjoin_part_1/part=2 [a] Reducer 2 - Needs Tagging: true - Reduce Operator Tree: - Join Operator - condition map: - Inner Join 0 to 1 - condition expressions: - 0 - 1 - Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE - Group By Operator - aggregations: count() - mode: hash - outputColumnNames: _col0 - Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - sort order: - Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE - tag: -1 - value expressions: _col0 (type: bigint) - auto parallelism: false - Reducer 3 Needs Tagging: false Reduce Operator Tree: Group By Operator @@ -1038,44 +1056,45 @@ TOK_QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 Stage-0 depends on stages: Stage-1 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-2 Spark - Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 4 (PARTITION-LEVEL SORT, 3) - Reducer 3 <- Reducer 2 (GROUP, 1) #### A masked pattern was here #### Vertices: - Map 1 + Map 3 Map Operator Tree: TableScan - alias: a + alias: b Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: key is not null (type: boolean) Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: int) - sort order: + - Map-reduce partition columns: key (type: int) - Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - tag: 0 - auto parallelism: false + Spark HashTable Sink Operator + condition expressions: + 0 + 1 + keys: + 0 key (type: int) + 1 key (type: int) + Position of Big Table: 0 + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: #### A masked pattern was here #### Partition - base file name: part=2 + base file name: part=1 input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat partition values: - part 2 + part 1 properties: COLUMN_STATS_ACCURATE true bucket_count 2 @@ -1084,13 +1103,13 @@ STAGE PLANS: columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_1 + name default.srcbucket_mapjoin_part_2 numFiles 2 numRows 500 partition_columns part partition_columns.types string rawDataSize 5312 - serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 5812 @@ -1101,50 +1120,76 @@ STAGE PLANS: output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: bucket_count 2 - bucket_field_name value + bucket_field_name key columns key,value columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_1 + name default.srcbucket_mapjoin_part_2 partition_columns part partition_columns.types string - serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.srcbucket_mapjoin_part_1 - name: default.srcbucket_mapjoin_part_1 + name: default.srcbucket_mapjoin_part_2 + name: default.srcbucket_mapjoin_part_2 Truncated Path -> Alias: - /srcbucket_mapjoin_part_1/part=2 [a] - Map 4 + /srcbucket_mapjoin_part_2/part=1 [b] + + Stage: Stage-1 + Spark + Edges: + Reducer 2 <- Map 1 (GROUP, 1) +#### A masked pattern was here #### + Vertices: + Map 1 Map Operator Tree: TableScan - alias: b + alias: a Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: key is not null (type: boolean) Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: int) - sort order: + - Map-reduce partition columns: key (type: int) - Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - tag: 1 - auto parallelism: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + keys: + 0 key (type: int) + 1 key (type: int) + input vertices: + 1 Map 3 + Position of Big Table: 0 + Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count() + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + tag: -1 + value expressions: _col0 (type: bigint) + auto parallelism: false + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: #### A masked pattern was here #### Partition - base file name: part=1 + base file name: part=2 input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat partition values: - part 1 + part 2 properties: COLUMN_STATS_ACCURATE true bucket_count 2 @@ -1153,13 +1198,13 @@ STAGE PLANS: columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_2 + name default.srcbucket_mapjoin_part_1 numFiles 2 numRows 500 partition_columns part partition_columns.types string rawDataSize 5312 - serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 5812 @@ -1170,45 +1215,24 @@ STAGE PLANS: output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: bucket_count 2 - bucket_field_name key + bucket_field_name value columns key,value columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_2 + name default.srcbucket_mapjoin_part_1 partition_columns part partition_columns.types string - serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.srcbucket_mapjoin_part_2 - name: default.srcbucket_mapjoin_part_2 + name: default.srcbucket_mapjoin_part_1 + name: default.srcbucket_mapjoin_part_1 Truncated Path -> Alias: - /srcbucket_mapjoin_part_2/part=1 [b] + /srcbucket_mapjoin_part_1/part=2 [a] Reducer 2 - Needs Tagging: true - Reduce Operator Tree: - Join Operator - condition map: - Inner Join 0 to 1 - condition expressions: - 0 - 1 - Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE - Group By Operator - aggregations: count() - mode: hash - outputColumnNames: _col0 - Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - sort order: - Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE - tag: -1 - value expressions: _col0 (type: bigint) - auto parallelism: false - Reducer 3 Needs Tagging: false Reduce Operator Tree: Group By Operator diff --git ql/src/test/results/clientpositive/spark/bucketmapjoin2.q.out ql/src/test/results/clientpositive/spark/bucketmapjoin2.q.out index c4702ef..ea763c7 100644 --- ql/src/test/results/clientpositive/spark/bucketmapjoin2.q.out +++ ql/src/test/results/clientpositive/spark/bucketmapjoin2.q.out @@ -158,35 +158,36 @@ TOK_QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-3 is a root stage + Stage-1 depends on stages: Stage-3 Stage-0 depends on stages: Stage-1 Stage-2 depends on stages: Stage-0 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-3 Spark - Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 3 (PARTITION-LEVEL SORT, 1) #### A masked pattern was here #### Vertices: - Map 1 + Map 2 Map Operator Tree: TableScan - alias: a - Statistics: Num rows: 55 Data size: 5812 Basic stats: COMPLETE Column stats: NONE + alias: b + Statistics: Num rows: 29 Data size: 3062 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: key is not null (type: boolean) - Statistics: Num rows: 28 Data size: 2958 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: int) - sort order: + - Map-reduce partition columns: key (type: int) - Statistics: Num rows: 28 Data size: 2958 Basic stats: COMPLETE Column stats: NONE - tag: 0 - value expressions: value (type: string) - auto parallelism: false + Statistics: Num rows: 15 Data size: 1583 Basic stats: COMPLETE Column stats: NONE + Spark HashTable Sink Operator + condition expressions: + 0 {key} {value} + 1 {value} + keys: + 0 key (type: int) + 1 key (type: int) + Position of Big Table: 0 + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: @@ -199,64 +200,107 @@ STAGE PLANS: ds 2008-04-08 properties: COLUMN_STATS_ACCURATE true - bucket_count 4 + bucket_count 2 bucket_field_name key columns key,value columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part - numFiles 4 + name default.srcbucket_mapjoin_part_2 + numFiles 2 numRows 0 partition_columns ds partition_columns.types string rawDataSize 0 - serialization.ddl struct srcbucket_mapjoin_part { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - totalSize 5812 + totalSize 3062 #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: - bucket_count 4 + bucket_count 2 bucket_field_name key columns key,value columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part + name default.srcbucket_mapjoin_part_2 partition_columns ds partition_columns.types string - serialization.ddl struct srcbucket_mapjoin_part { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.srcbucket_mapjoin_part - name: default.srcbucket_mapjoin_part + name: default.srcbucket_mapjoin_part_2 + name: default.srcbucket_mapjoin_part_2 Truncated Path -> Alias: - /srcbucket_mapjoin_part/ds=2008-04-08 [a] - Map 3 + /srcbucket_mapjoin_part_2/ds=2008-04-08 [b] + + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: + Map 1 Map Operator Tree: TableScan - alias: b - Statistics: Num rows: 29 Data size: 3062 Basic stats: COMPLETE Column stats: NONE + alias: a + Statistics: Num rows: 55 Data size: 5812 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: key is not null (type: boolean) - Statistics: Num rows: 15 Data size: 1583 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: int) - sort order: + - Map-reduce partition columns: key (type: int) - Statistics: Num rows: 15 Data size: 1583 Basic stats: COMPLETE Column stats: NONE - tag: 1 - value expressions: value (type: string) - auto parallelism: false + Statistics: Num rows: 28 Data size: 2958 Basic stats: COMPLETE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {value} + 1 {value} + keys: + 0 key (type: int) + 1 key (type: int) + outputColumnNames: _col0, _col1, _col7 + input vertices: + 1 Map 2 + Position of Big Table: 0 + Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: int), _col1 (type: string), _col7 (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + GlobalTableId: 1 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns key,value1,value2 + columns.comments + columns.types string:string:string +#### A masked pattern was here #### + name default.bucketmapjoin_tmp_result + serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucketmapjoin_tmp_result + TotalFiles: 1 + GatherStats: true + MultiFileSpray: false + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: @@ -269,87 +313,46 @@ STAGE PLANS: ds 2008-04-08 properties: COLUMN_STATS_ACCURATE true - bucket_count 2 + bucket_count 4 bucket_field_name key columns key,value columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_2 - numFiles 2 + name default.srcbucket_mapjoin_part + numFiles 4 numRows 0 partition_columns ds partition_columns.types string rawDataSize 0 - serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - totalSize 3062 + totalSize 5812 #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: - bucket_count 2 + bucket_count 4 bucket_field_name key columns key,value columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_2 + name default.srcbucket_mapjoin_part partition_columns ds partition_columns.types string - serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.srcbucket_mapjoin_part_2 - name: default.srcbucket_mapjoin_part_2 + name: default.srcbucket_mapjoin_part + name: default.srcbucket_mapjoin_part Truncated Path -> Alias: - /srcbucket_mapjoin_part_2/ds=2008-04-08 [b] - Reducer 2 - Needs Tagging: true - Reduce Operator Tree: - Join Operator - condition map: - Inner Join 0 to 1 - condition expressions: - 0 {KEY.reducesinkkey0} {VALUE._col0} - 1 {VALUE._col0} - outputColumnNames: _col0, _col1, _col7 - Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE - Select Operator - expressions: _col0 (type: int), _col1 (type: string), _col7 (type: string) - outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - GlobalTableId: 1 -#### A masked pattern was here #### - NumFilesPerFileSink: 1 - Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE -#### A masked pattern was here #### - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - properties: - bucket_count -1 - columns key,value1,value2 - columns.comments - columns.types string:string:string -#### A masked pattern was here #### - name default.bucketmapjoin_tmp_result - serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2} - serialization.format 1 - serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe -#### A masked pattern was here #### - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.bucketmapjoin_tmp_result - TotalFiles: 1 - GatherStats: true - MultiFileSpray: false + /srcbucket_mapjoin_part/ds=2008-04-08 [a] Stage: Stage-0 Move Operator @@ -552,15 +555,14 @@ TOK_QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-3 is a root stage + Stage-1 depends on stages: Stage-3 Stage-0 depends on stages: Stage-1 Stage-2 depends on stages: Stage-0 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-3 Spark - Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 3 (PARTITION-LEVEL SORT, 3) #### A masked pattern was here #### Vertices: Map 1 @@ -573,14 +575,16 @@ STAGE PLANS: isSamplingPred: false predicate: key is not null (type: boolean) Statistics: Num rows: 28 Data size: 2958 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: int) - sort order: + - Map-reduce partition columns: key (type: int) - Statistics: Num rows: 28 Data size: 2958 Basic stats: COMPLETE Column stats: NONE - tag: 0 - value expressions: value (type: string) - auto parallelism: false + Spark HashTable Sink Operator + condition expressions: + 0 {value} + 1 {value} + keys: + 0 key (type: int) + 1 key (type: int) + Position of Big Table: 1 + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: @@ -633,7 +637,12 @@ STAGE PLANS: name: default.srcbucket_mapjoin_part Truncated Path -> Alias: /srcbucket_mapjoin_part/ds=2008-04-08 [a] - Map 3 + + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: + Map 2 Map Operator Tree: TableScan alias: b @@ -643,14 +652,57 @@ STAGE PLANS: isSamplingPred: false predicate: key is not null (type: boolean) Statistics: Num rows: 15 Data size: 1583 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: int) - sort order: + - Map-reduce partition columns: key (type: int) - Statistics: Num rows: 15 Data size: 1583 Basic stats: COMPLETE Column stats: NONE - tag: 1 - value expressions: value (type: string) - auto parallelism: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {value} + 1 {value} + keys: + 0 key (type: int) + 1 key (type: int) + outputColumnNames: _col0, _col1, _col7 + input vertices: + 0 Map 1 + Position of Big Table: 1 + Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: int), _col1 (type: string), _col7 (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + GlobalTableId: 1 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + COLUMN_STATS_ACCURATE true + bucket_count -1 + columns key,value1,value2 + columns.comments + columns.types string:string:string +#### A masked pattern was here #### + name default.bucketmapjoin_tmp_result + numFiles 1 + numRows 564 + rawDataSize 10503 + serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 11067 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucketmapjoin_tmp_result + TotalFiles: 1 + GatherStats: true + MultiFileSpray: false + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: @@ -703,52 +755,6 @@ STAGE PLANS: name: default.srcbucket_mapjoin_part_2 Truncated Path -> Alias: /srcbucket_mapjoin_part_2/ds=2008-04-08 [b] - Reducer 2 - Needs Tagging: true - Reduce Operator Tree: - Join Operator - condition map: - Inner Join 0 to 1 - condition expressions: - 0 {KEY.reducesinkkey0} {VALUE._col0} - 1 {VALUE._col0} - outputColumnNames: _col0, _col1, _col7 - Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE - Select Operator - expressions: _col0 (type: int), _col1 (type: string), _col7 (type: string) - outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - GlobalTableId: 1 -#### A masked pattern was here #### - NumFilesPerFileSink: 1 - Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE -#### A masked pattern was here #### - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - properties: - COLUMN_STATS_ACCURATE true - bucket_count -1 - columns key,value1,value2 - columns.comments - columns.types string:string:string -#### A masked pattern was here #### - name default.bucketmapjoin_tmp_result - numFiles 3 - numRows 564 - rawDataSize 10503 - serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2} - serialization.format 1 - serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - totalSize 11067 -#### A masked pattern was here #### - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.bucketmapjoin_tmp_result - TotalFiles: 1 - GatherStats: true - MultiFileSpray: false Stage: Stage-0 Move Operator @@ -766,7 +772,7 @@ STAGE PLANS: columns.types string:string:string #### A masked pattern was here #### name default.bucketmapjoin_tmp_result - numFiles 3 + numFiles 1 numRows 564 rawDataSize 10503 serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2} @@ -968,35 +974,36 @@ TOK_QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-3 is a root stage + Stage-1 depends on stages: Stage-3 Stage-0 depends on stages: Stage-1 Stage-2 depends on stages: Stage-0 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-3 Spark - Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 3 (PARTITION-LEVEL SORT, 3) #### A masked pattern was here #### Vertices: - Map 1 + Map 2 Map Operator Tree: TableScan - alias: a - Statistics: Num rows: 55 Data size: 5812 Basic stats: COMPLETE Column stats: NONE + alias: b + Statistics: Num rows: 58 Data size: 6124 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: key is not null (type: boolean) - Statistics: Num rows: 28 Data size: 2958 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: int) - sort order: + - Map-reduce partition columns: key (type: int) - Statistics: Num rows: 28 Data size: 2958 Basic stats: COMPLETE Column stats: NONE - tag: 0 - value expressions: value (type: string) - auto parallelism: false + Statistics: Num rows: 29 Data size: 3062 Basic stats: COMPLETE Column stats: NONE + Spark HashTable Sink Operator + condition expressions: + 0 {key} {value} + 1 {value} + keys: + 0 key (type: int) + 1 key (type: int) + Position of Big Table: 0 + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: @@ -1009,74 +1016,51 @@ STAGE PLANS: ds 2008-04-08 properties: COLUMN_STATS_ACCURATE true - bucket_count 4 + bucket_count 2 bucket_field_name key columns key,value columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part - numFiles 4 + name default.srcbucket_mapjoin_part_2 + numFiles 2 numRows 0 partition_columns ds partition_columns.types string rawDataSize 0 - serialization.ddl struct srcbucket_mapjoin_part { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - totalSize 5812 + totalSize 3062 #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: - bucket_count 4 + bucket_count 2 bucket_field_name key columns key,value columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part + name default.srcbucket_mapjoin_part_2 partition_columns ds partition_columns.types string - serialization.ddl struct srcbucket_mapjoin_part { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.srcbucket_mapjoin_part - name: default.srcbucket_mapjoin_part - Truncated Path -> Alias: - /srcbucket_mapjoin_part/ds=2008-04-08 [a] - Map 3 - Map Operator Tree: - TableScan - alias: b - Statistics: Num rows: 58 Data size: 6124 Basic stats: COMPLETE Column stats: NONE - GatherStats: false - Filter Operator - isSamplingPred: false - predicate: key is not null (type: boolean) - Statistics: Num rows: 29 Data size: 3062 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: int) - sort order: + - Map-reduce partition columns: key (type: int) - Statistics: Num rows: 29 Data size: 3062 Basic stats: COMPLETE Column stats: NONE - tag: 1 - value expressions: value (type: string) - auto parallelism: false - Path -> Alias: -#### A masked pattern was here #### - Path -> Partition: + name: default.srcbucket_mapjoin_part_2 + name: default.srcbucket_mapjoin_part_2 #### A masked pattern was here #### Partition - base file name: ds=2008-04-08 + base file name: ds=2008-04-09 input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat partition values: - ds 2008-04-08 + ds 2008-04-09 properties: COLUMN_STATS_ACCURATE true bucket_count 2 @@ -1117,102 +1101,127 @@ STAGE PLANS: serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.srcbucket_mapjoin_part_2 name: default.srcbucket_mapjoin_part_2 + Truncated Path -> Alias: + /srcbucket_mapjoin_part_2/ds=2008-04-08 [b] + /srcbucket_mapjoin_part_2/ds=2008-04-09 [b] + + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: a + Statistics: Num rows: 55 Data size: 5812 Basic stats: COMPLETE Column stats: NONE + GatherStats: false + Filter Operator + isSamplingPred: false + predicate: key is not null (type: boolean) + Statistics: Num rows: 28 Data size: 2958 Basic stats: COMPLETE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {value} + 1 {value} + keys: + 0 key (type: int) + 1 key (type: int) + outputColumnNames: _col0, _col1, _col7 + input vertices: + 1 Map 2 + Position of Big Table: 0 + Statistics: Num rows: 31 Data size: 3368 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: int), _col1 (type: string), _col7 (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 31 Data size: 3368 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + GlobalTableId: 1 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + Statistics: Num rows: 31 Data size: 3368 Basic stats: COMPLETE Column stats: NONE +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + COLUMN_STATS_ACCURATE true + bucket_count -1 + columns key,value1,value2 + columns.comments + columns.types string:string:string +#### A masked pattern was here #### + name default.bucketmapjoin_tmp_result + numFiles 1 + numRows 564 + rawDataSize 10503 + serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 11067 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucketmapjoin_tmp_result + TotalFiles: 1 + GatherStats: true + MultiFileSpray: false + Local Work: + Map Reduce Local Work + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: #### A masked pattern was here #### Partition - base file name: ds=2008-04-09 + base file name: ds=2008-04-08 input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat partition values: - ds 2008-04-09 + ds 2008-04-08 properties: COLUMN_STATS_ACCURATE true - bucket_count 2 + bucket_count 4 bucket_field_name key columns key,value columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_2 - numFiles 2 + name default.srcbucket_mapjoin_part + numFiles 4 numRows 0 partition_columns ds partition_columns.types string rawDataSize 0 - serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - totalSize 3062 + totalSize 5812 #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: - bucket_count 2 + bucket_count 4 bucket_field_name key columns key,value columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_2 + name default.srcbucket_mapjoin_part partition_columns ds partition_columns.types string - serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.srcbucket_mapjoin_part_2 - name: default.srcbucket_mapjoin_part_2 + name: default.srcbucket_mapjoin_part + name: default.srcbucket_mapjoin_part Truncated Path -> Alias: - /srcbucket_mapjoin_part_2/ds=2008-04-08 [b] - /srcbucket_mapjoin_part_2/ds=2008-04-09 [b] - Reducer 2 - Needs Tagging: true - Reduce Operator Tree: - Join Operator - condition map: - Inner Join 0 to 1 - condition expressions: - 0 {KEY.reducesinkkey0} {VALUE._col0} - 1 {VALUE._col0} - outputColumnNames: _col0, _col1, _col7 - Statistics: Num rows: 31 Data size: 3368 Basic stats: COMPLETE Column stats: NONE - Select Operator - expressions: _col0 (type: int), _col1 (type: string), _col7 (type: string) - outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 31 Data size: 3368 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - GlobalTableId: 1 -#### A masked pattern was here #### - NumFilesPerFileSink: 1 - Statistics: Num rows: 31 Data size: 3368 Basic stats: COMPLETE Column stats: NONE -#### A masked pattern was here #### - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - properties: - COLUMN_STATS_ACCURATE true - bucket_count -1 - columns key,value1,value2 - columns.comments - columns.types string:string:string -#### A masked pattern was here #### - name default.bucketmapjoin_tmp_result - numFiles 3 - numRows 564 - rawDataSize 10503 - serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2} - serialization.format 1 - serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - totalSize 11067 -#### A masked pattern was here #### - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.bucketmapjoin_tmp_result - TotalFiles: 1 - GatherStats: true - MultiFileSpray: false + /srcbucket_mapjoin_part/ds=2008-04-08 [a] Stage: Stage-0 Move Operator @@ -1230,7 +1239,7 @@ STAGE PLANS: columns.types string:string:string #### A masked pattern was here #### name default.bucketmapjoin_tmp_result - numFiles 3 + numFiles 1 numRows 564 rawDataSize 10503 serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2} diff --git ql/src/test/results/clientpositive/spark/bucketmapjoin3.q.out ql/src/test/results/clientpositive/spark/bucketmapjoin3.q.out index 7c31e05..1b31561 100644 --- ql/src/test/results/clientpositive/spark/bucketmapjoin3.q.out +++ ql/src/test/results/clientpositive/spark/bucketmapjoin3.q.out @@ -189,35 +189,36 @@ TOK_QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-3 is a root stage + Stage-1 depends on stages: Stage-3 Stage-0 depends on stages: Stage-1 Stage-2 depends on stages: Stage-0 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-3 Spark - Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 3 (PARTITION-LEVEL SORT, 1) #### A masked pattern was here #### Vertices: - Map 1 + Map 2 Map Operator Tree: TableScan - alias: a - Statistics: Num rows: 29 Data size: 3062 Basic stats: COMPLETE Column stats: NONE + alias: b + Statistics: Num rows: 55 Data size: 5812 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: key is not null (type: boolean) - Statistics: Num rows: 15 Data size: 1583 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: int) - sort order: + - Map-reduce partition columns: key (type: int) - Statistics: Num rows: 15 Data size: 1583 Basic stats: COMPLETE Column stats: NONE - tag: 0 - value expressions: value (type: string) - auto parallelism: false + Statistics: Num rows: 28 Data size: 2958 Basic stats: COMPLETE Column stats: NONE + Spark HashTable Sink Operator + condition expressions: + 0 {key} {value} + 1 {value} + keys: + 0 key (type: int) + 1 key (type: int) + Position of Big Table: 0 + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: @@ -230,64 +231,107 @@ STAGE PLANS: ds 2008-04-08 properties: COLUMN_STATS_ACCURATE true - bucket_count 2 + bucket_count 4 bucket_field_name key columns key,value columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_2 - numFiles 2 + name default.srcbucket_mapjoin_part + numFiles 4 numRows 0 partition_columns ds partition_columns.types string rawDataSize 0 - serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - totalSize 3062 + totalSize 5812 #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: - bucket_count 2 + bucket_count 4 bucket_field_name key columns key,value columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_2 + name default.srcbucket_mapjoin_part partition_columns ds partition_columns.types string - serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.srcbucket_mapjoin_part_2 - name: default.srcbucket_mapjoin_part_2 + name: default.srcbucket_mapjoin_part + name: default.srcbucket_mapjoin_part Truncated Path -> Alias: - /srcbucket_mapjoin_part_2/ds=2008-04-08 [a] - Map 3 + /srcbucket_mapjoin_part/ds=2008-04-08 [b] + + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: + Map 1 Map Operator Tree: TableScan - alias: b - Statistics: Num rows: 55 Data size: 5812 Basic stats: COMPLETE Column stats: NONE + alias: a + Statistics: Num rows: 29 Data size: 3062 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: key is not null (type: boolean) - Statistics: Num rows: 28 Data size: 2958 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: int) - sort order: + - Map-reduce partition columns: key (type: int) - Statistics: Num rows: 28 Data size: 2958 Basic stats: COMPLETE Column stats: NONE - tag: 1 - value expressions: value (type: string) - auto parallelism: false + Statistics: Num rows: 15 Data size: 1583 Basic stats: COMPLETE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {value} + 1 {value} + keys: + 0 key (type: int) + 1 key (type: int) + outputColumnNames: _col0, _col1, _col7 + input vertices: + 1 Map 2 + Position of Big Table: 0 + Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: int), _col1 (type: string), _col7 (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + GlobalTableId: 1 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns key,value1,value2 + columns.comments + columns.types string:string:string +#### A masked pattern was here #### + name default.bucketmapjoin_tmp_result + serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucketmapjoin_tmp_result + TotalFiles: 1 + GatherStats: true + MultiFileSpray: false + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: @@ -300,87 +344,46 @@ STAGE PLANS: ds 2008-04-08 properties: COLUMN_STATS_ACCURATE true - bucket_count 4 + bucket_count 2 bucket_field_name key columns key,value columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part - numFiles 4 + name default.srcbucket_mapjoin_part_2 + numFiles 2 numRows 0 partition_columns ds partition_columns.types string rawDataSize 0 - serialization.ddl struct srcbucket_mapjoin_part { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - totalSize 5812 + totalSize 3062 #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: - bucket_count 4 + bucket_count 2 bucket_field_name key columns key,value columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part + name default.srcbucket_mapjoin_part_2 partition_columns ds partition_columns.types string - serialization.ddl struct srcbucket_mapjoin_part { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.srcbucket_mapjoin_part - name: default.srcbucket_mapjoin_part + name: default.srcbucket_mapjoin_part_2 + name: default.srcbucket_mapjoin_part_2 Truncated Path -> Alias: - /srcbucket_mapjoin_part/ds=2008-04-08 [b] - Reducer 2 - Needs Tagging: true - Reduce Operator Tree: - Join Operator - condition map: - Inner Join 0 to 1 - condition expressions: - 0 {KEY.reducesinkkey0} {VALUE._col0} - 1 {VALUE._col0} - outputColumnNames: _col0, _col1, _col7 - Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE - Select Operator - expressions: _col0 (type: int), _col1 (type: string), _col7 (type: string) - outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - GlobalTableId: 1 -#### A masked pattern was here #### - NumFilesPerFileSink: 1 - Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE -#### A masked pattern was here #### - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - properties: - bucket_count -1 - columns key,value1,value2 - columns.comments - columns.types string:string:string -#### A masked pattern was here #### - name default.bucketmapjoin_tmp_result - serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2} - serialization.format 1 - serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe -#### A masked pattern was here #### - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.bucketmapjoin_tmp_result - TotalFiles: 1 - GatherStats: true - MultiFileSpray: false + /srcbucket_mapjoin_part_2/ds=2008-04-08 [a] Stage: Stage-0 Move Operator @@ -590,15 +593,14 @@ TOK_QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-3 is a root stage + Stage-1 depends on stages: Stage-3 Stage-0 depends on stages: Stage-1 Stage-2 depends on stages: Stage-0 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-3 Spark - Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 3 (PARTITION-LEVEL SORT, 3) #### A masked pattern was here #### Vertices: Map 1 @@ -611,14 +613,16 @@ STAGE PLANS: isSamplingPred: false predicate: key is not null (type: boolean) Statistics: Num rows: 15 Data size: 1583 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: int) - sort order: + - Map-reduce partition columns: key (type: int) - Statistics: Num rows: 15 Data size: 1583 Basic stats: COMPLETE Column stats: NONE - tag: 0 - value expressions: value (type: string) - auto parallelism: false + Spark HashTable Sink Operator + condition expressions: + 0 {value} + 1 {value} + keys: + 0 key (type: int) + 1 key (type: int) + Position of Big Table: 1 + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: @@ -671,7 +675,12 @@ STAGE PLANS: name: default.srcbucket_mapjoin_part_2 Truncated Path -> Alias: /srcbucket_mapjoin_part_2/ds=2008-04-08 [a] - Map 3 + + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: + Map 2 Map Operator Tree: TableScan alias: b @@ -681,14 +690,57 @@ STAGE PLANS: isSamplingPred: false predicate: key is not null (type: boolean) Statistics: Num rows: 28 Data size: 2958 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: int) - sort order: + - Map-reduce partition columns: key (type: int) - Statistics: Num rows: 28 Data size: 2958 Basic stats: COMPLETE Column stats: NONE - tag: 1 - value expressions: value (type: string) - auto parallelism: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {value} + 1 {value} + keys: + 0 key (type: int) + 1 key (type: int) + outputColumnNames: _col0, _col1, _col7 + input vertices: + 0 Map 1 + Position of Big Table: 1 + Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: int), _col1 (type: string), _col7 (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + GlobalTableId: 1 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + COLUMN_STATS_ACCURATE true + bucket_count -1 + columns key,value1,value2 + columns.comments + columns.types string:string:string +#### A masked pattern was here #### + name default.bucketmapjoin_tmp_result + numFiles 1 + numRows 564 + rawDataSize 10503 + serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 11067 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucketmapjoin_tmp_result + TotalFiles: 1 + GatherStats: true + MultiFileSpray: false + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: @@ -741,52 +793,6 @@ STAGE PLANS: name: default.srcbucket_mapjoin_part Truncated Path -> Alias: /srcbucket_mapjoin_part/ds=2008-04-08 [b] - Reducer 2 - Needs Tagging: true - Reduce Operator Tree: - Join Operator - condition map: - Inner Join 0 to 1 - condition expressions: - 0 {KEY.reducesinkkey0} {VALUE._col0} - 1 {VALUE._col0} - outputColumnNames: _col0, _col1, _col7 - Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE - Select Operator - expressions: _col0 (type: int), _col1 (type: string), _col7 (type: string) - outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - GlobalTableId: 1 -#### A masked pattern was here #### - NumFilesPerFileSink: 1 - Statistics: Num rows: 30 Data size: 3253 Basic stats: COMPLETE Column stats: NONE -#### A masked pattern was here #### - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - properties: - COLUMN_STATS_ACCURATE true - bucket_count -1 - columns key,value1,value2 - columns.comments - columns.types string:string:string -#### A masked pattern was here #### - name default.bucketmapjoin_tmp_result - numFiles 3 - numRows 564 - rawDataSize 10503 - serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2} - serialization.format 1 - serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - totalSize 11067 -#### A masked pattern was here #### - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.bucketmapjoin_tmp_result - TotalFiles: 1 - GatherStats: true - MultiFileSpray: false Stage: Stage-0 Move Operator @@ -804,7 +810,7 @@ STAGE PLANS: columns.types string:string:string #### A masked pattern was here #### name default.bucketmapjoin_tmp_result - numFiles 3 + numFiles 1 numRows 564 rawDataSize 10503 serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2} diff --git ql/src/test/results/clientpositive/spark/bucketmapjoin4.q.out ql/src/test/results/clientpositive/spark/bucketmapjoin4.q.out index a8e892e..97d2d74 100644 --- ql/src/test/results/clientpositive/spark/bucketmapjoin4.q.out +++ ql/src/test/results/clientpositive/spark/bucketmapjoin4.q.out @@ -175,35 +175,36 @@ TOK_QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-3 is a root stage + Stage-1 depends on stages: Stage-3 Stage-0 depends on stages: Stage-1 Stage-2 depends on stages: Stage-0 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-3 Spark - Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 3 (PARTITION-LEVEL SORT, 1) #### A masked pattern was here #### Vertices: - Map 1 + Map 2 Map Operator Tree: TableScan - alias: a + alias: b Statistics: Num rows: 26 Data size: 2750 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: key is not null (type: boolean) Statistics: Num rows: 13 Data size: 1375 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: int) - sort order: + - Map-reduce partition columns: key (type: int) - Statistics: Num rows: 13 Data size: 1375 Basic stats: COMPLETE Column stats: NONE - tag: 0 - value expressions: value (type: string) - auto parallelism: false + Spark HashTable Sink Operator + condition expressions: + 0 {key} {value} + 1 {value} + keys: + 0 key (type: int) + 1 key (type: int) + Position of Big Table: 0 + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: @@ -250,25 +251,68 @@ STAGE PLANS: name: default.srcbucket_mapjoin name: default.srcbucket_mapjoin Truncated Path -> Alias: - /srcbucket_mapjoin [a] - Map 3 + /srcbucket_mapjoin [b] + + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: + Map 1 Map Operator Tree: TableScan - alias: b + alias: a Statistics: Num rows: 26 Data size: 2750 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: key is not null (type: boolean) Statistics: Num rows: 13 Data size: 1375 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: int) - sort order: + - Map-reduce partition columns: key (type: int) - Statistics: Num rows: 13 Data size: 1375 Basic stats: COMPLETE Column stats: NONE - tag: 1 - value expressions: value (type: string) - auto parallelism: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {value} + 1 {value} + keys: + 0 key (type: int) + 1 key (type: int) + outputColumnNames: _col0, _col1, _col6 + input vertices: + 1 Map 2 + Position of Big Table: 0 + Statistics: Num rows: 14 Data size: 1512 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: int), _col1 (type: string), _col6 (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 14 Data size: 1512 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + GlobalTableId: 1 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + Statistics: Num rows: 14 Data size: 1512 Basic stats: COMPLETE Column stats: NONE +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns key,value1,value2 + columns.comments + columns.types string:string:string +#### A masked pattern was here #### + name default.bucketmapjoin_tmp_result + serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucketmapjoin_tmp_result + TotalFiles: 1 + GatherStats: true + MultiFileSpray: false + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: @@ -315,48 +359,7 @@ STAGE PLANS: name: default.srcbucket_mapjoin name: default.srcbucket_mapjoin Truncated Path -> Alias: - /srcbucket_mapjoin [b] - Reducer 2 - Needs Tagging: true - Reduce Operator Tree: - Join Operator - condition map: - Inner Join 0 to 1 - condition expressions: - 0 {KEY.reducesinkkey0} {VALUE._col0} - 1 {VALUE._col0} - outputColumnNames: _col0, _col1, _col6 - Statistics: Num rows: 14 Data size: 1512 Basic stats: COMPLETE Column stats: NONE - Select Operator - expressions: _col0 (type: int), _col1 (type: string), _col6 (type: string) - outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 14 Data size: 1512 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - GlobalTableId: 1 -#### A masked pattern was here #### - NumFilesPerFileSink: 1 - Statistics: Num rows: 14 Data size: 1512 Basic stats: COMPLETE Column stats: NONE -#### A masked pattern was here #### - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - properties: - bucket_count -1 - columns key,value1,value2 - columns.comments - columns.types string:string:string -#### A masked pattern was here #### - name default.bucketmapjoin_tmp_result - serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2} - serialization.format 1 - serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe -#### A masked pattern was here #### - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.bucketmapjoin_tmp_result - TotalFiles: 1 - GatherStats: true - MultiFileSpray: false + /srcbucket_mapjoin [a] Stage: Stage-0 Move Operator @@ -540,15 +543,14 @@ TOK_QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-3 is a root stage + Stage-1 depends on stages: Stage-3 Stage-0 depends on stages: Stage-1 Stage-2 depends on stages: Stage-0 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-3 Spark - Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 3 (PARTITION-LEVEL SORT, 3) #### A masked pattern was here #### Vertices: Map 1 @@ -561,14 +563,16 @@ STAGE PLANS: isSamplingPred: false predicate: key is not null (type: boolean) Statistics: Num rows: 13 Data size: 1375 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: int) - sort order: + - Map-reduce partition columns: key (type: int) - Statistics: Num rows: 13 Data size: 1375 Basic stats: COMPLETE Column stats: NONE - tag: 0 - value expressions: value (type: string) - auto parallelism: false + Spark HashTable Sink Operator + condition expressions: + 0 {value} + 1 {value} + keys: + 0 key (type: int) + 1 key (type: int) + Position of Big Table: 1 + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: @@ -616,7 +620,12 @@ STAGE PLANS: name: default.srcbucket_mapjoin Truncated Path -> Alias: /srcbucket_mapjoin [a] - Map 3 + + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: + Map 2 Map Operator Tree: TableScan alias: b @@ -626,14 +635,57 @@ STAGE PLANS: isSamplingPred: false predicate: key is not null (type: boolean) Statistics: Num rows: 13 Data size: 1375 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: int) - sort order: + - Map-reduce partition columns: key (type: int) - Statistics: Num rows: 13 Data size: 1375 Basic stats: COMPLETE Column stats: NONE - tag: 1 - value expressions: value (type: string) - auto parallelism: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {value} + 1 {value} + keys: + 0 key (type: int) + 1 key (type: int) + outputColumnNames: _col0, _col1, _col6 + input vertices: + 0 Map 1 + Position of Big Table: 1 + Statistics: Num rows: 14 Data size: 1512 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: int), _col1 (type: string), _col6 (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 14 Data size: 1512 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + GlobalTableId: 1 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + Statistics: Num rows: 14 Data size: 1512 Basic stats: COMPLETE Column stats: NONE +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + COLUMN_STATS_ACCURATE true + bucket_count -1 + columns key,value1,value2 + columns.comments + columns.types string:string:string +#### A masked pattern was here #### + name default.bucketmapjoin_tmp_result + numFiles 2 + numRows 464 + rawDataSize 8519 + serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 8983 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucketmapjoin_tmp_result + TotalFiles: 1 + GatherStats: true + MultiFileSpray: false + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: @@ -681,52 +733,6 @@ STAGE PLANS: name: default.srcbucket_mapjoin Truncated Path -> Alias: /srcbucket_mapjoin [b] - Reducer 2 - Needs Tagging: true - Reduce Operator Tree: - Join Operator - condition map: - Inner Join 0 to 1 - condition expressions: - 0 {KEY.reducesinkkey0} {VALUE._col0} - 1 {VALUE._col0} - outputColumnNames: _col0, _col1, _col6 - Statistics: Num rows: 14 Data size: 1512 Basic stats: COMPLETE Column stats: NONE - Select Operator - expressions: _col0 (type: int), _col1 (type: string), _col6 (type: string) - outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 14 Data size: 1512 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - GlobalTableId: 1 -#### A masked pattern was here #### - NumFilesPerFileSink: 1 - Statistics: Num rows: 14 Data size: 1512 Basic stats: COMPLETE Column stats: NONE -#### A masked pattern was here #### - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - properties: - COLUMN_STATS_ACCURATE true - bucket_count -1 - columns key,value1,value2 - columns.comments - columns.types string:string:string -#### A masked pattern was here #### - name default.bucketmapjoin_tmp_result - numFiles 3 - numRows 464 - rawDataSize 8519 - serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2} - serialization.format 1 - serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - totalSize 8983 -#### A masked pattern was here #### - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.bucketmapjoin_tmp_result - TotalFiles: 1 - GatherStats: true - MultiFileSpray: false Stage: Stage-0 Move Operator @@ -744,7 +750,7 @@ STAGE PLANS: columns.types string:string:string #### A masked pattern was here #### name default.bucketmapjoin_tmp_result - numFiles 3 + numFiles 2 numRows 464 rawDataSize 8519 serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2} diff --git ql/src/test/results/clientpositive/spark/bucketmapjoin5.q.out ql/src/test/results/clientpositive/spark/bucketmapjoin5.q.out index 041ba12..94952a1 100644 --- ql/src/test/results/clientpositive/spark/bucketmapjoin5.q.out +++ ql/src/test/results/clientpositive/spark/bucketmapjoin5.q.out @@ -225,15 +225,14 @@ TOK_QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-3 is a root stage + Stage-1 depends on stages: Stage-3 Stage-0 depends on stages: Stage-1 Stage-2 depends on stages: Stage-0 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-3 Spark - Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 3 (PARTITION-LEVEL SORT, 1) #### A masked pattern was here #### Vertices: Map 1 @@ -246,14 +245,16 @@ STAGE PLANS: isSamplingPred: false predicate: key is not null (type: boolean) Statistics: Num rows: 13 Data size: 1375 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: int) - sort order: + - Map-reduce partition columns: key (type: int) - Statistics: Num rows: 13 Data size: 1375 Basic stats: COMPLETE Column stats: NONE - tag: 0 - value expressions: value (type: string) - auto parallelism: false + Spark HashTable Sink Operator + condition expressions: + 0 {value} + 1 {value} + keys: + 0 key (type: int) + 1 key (type: int) + Position of Big Table: 1 + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: @@ -301,7 +302,12 @@ STAGE PLANS: name: default.srcbucket_mapjoin Truncated Path -> Alias: /srcbucket_mapjoin [a] - Map 3 + + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: + Map 2 Map Operator Tree: TableScan alias: b @@ -311,14 +317,52 @@ STAGE PLANS: isSamplingPred: false predicate: key is not null (type: boolean) Statistics: Num rows: 55 Data size: 5812 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: int) - sort order: + - Map-reduce partition columns: key (type: int) - Statistics: Num rows: 55 Data size: 5812 Basic stats: COMPLETE Column stats: NONE - tag: 1 - value expressions: value (type: string) - auto parallelism: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {value} + 1 {value} + keys: + 0 key (type: int) + 1 key (type: int) + outputColumnNames: _col0, _col1, _col6 + input vertices: + 0 Map 1 + Position of Big Table: 1 + Statistics: Num rows: 60 Data size: 6393 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: int), _col1 (type: string), _col6 (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 60 Data size: 6393 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + GlobalTableId: 1 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + Statistics: Num rows: 60 Data size: 6393 Basic stats: COMPLETE Column stats: NONE +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns key,value1,value2 + columns.comments + columns.types string:string:string +#### A masked pattern was here #### + name default.bucketmapjoin_tmp_result + serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucketmapjoin_tmp_result + TotalFiles: 1 + GatherStats: true + MultiFileSpray: false + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: @@ -419,47 +463,6 @@ STAGE PLANS: Truncated Path -> Alias: /srcbucket_mapjoin_part/ds=2008-04-08 [b] /srcbucket_mapjoin_part/ds=2008-04-09 [b] - Reducer 2 - Needs Tagging: true - Reduce Operator Tree: - Join Operator - condition map: - Inner Join 0 to 1 - condition expressions: - 0 {KEY.reducesinkkey0} {VALUE._col0} - 1 {VALUE._col0} - outputColumnNames: _col0, _col1, _col6 - Statistics: Num rows: 60 Data size: 6393 Basic stats: COMPLETE Column stats: NONE - Select Operator - expressions: _col0 (type: int), _col1 (type: string), _col6 (type: string) - outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 60 Data size: 6393 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - GlobalTableId: 1 -#### A masked pattern was here #### - NumFilesPerFileSink: 1 - Statistics: Num rows: 60 Data size: 6393 Basic stats: COMPLETE Column stats: NONE -#### A masked pattern was here #### - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - properties: - bucket_count -1 - columns key,value1,value2 - columns.comments - columns.types string:string:string -#### A masked pattern was here #### - name default.bucketmapjoin_tmp_result - serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2} - serialization.format 1 - serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe -#### A masked pattern was here #### - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.bucketmapjoin_tmp_result - TotalFiles: 1 - GatherStats: true - MultiFileSpray: false Stage: Stage-0 Move Operator @@ -655,15 +658,14 @@ TOK_QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-3 is a root stage + Stage-1 depends on stages: Stage-3 Stage-0 depends on stages: Stage-1 Stage-2 depends on stages: Stage-0 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-3 Spark - Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 3 (PARTITION-LEVEL SORT, 3) #### A masked pattern was here #### Vertices: Map 1 @@ -676,14 +678,16 @@ STAGE PLANS: isSamplingPred: false predicate: key is not null (type: boolean) Statistics: Num rows: 13 Data size: 1375 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: int) - sort order: + - Map-reduce partition columns: key (type: int) - Statistics: Num rows: 13 Data size: 1375 Basic stats: COMPLETE Column stats: NONE - tag: 0 - value expressions: value (type: string) - auto parallelism: false + Spark HashTable Sink Operator + condition expressions: + 0 {value} + 1 {value} + keys: + 0 key (type: int) + 1 key (type: int) + Position of Big Table: 1 + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: @@ -731,7 +735,12 @@ STAGE PLANS: name: default.srcbucket_mapjoin Truncated Path -> Alias: /srcbucket_mapjoin [a] - Map 3 + + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: + Map 2 Map Operator Tree: TableScan alias: b @@ -741,14 +750,57 @@ STAGE PLANS: isSamplingPred: false predicate: key is not null (type: boolean) Statistics: Num rows: 29 Data size: 3062 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: int) - sort order: + - Map-reduce partition columns: key (type: int) - Statistics: Num rows: 29 Data size: 3062 Basic stats: COMPLETE Column stats: NONE - tag: 1 - value expressions: value (type: string) - auto parallelism: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {value} + 1 {value} + keys: + 0 key (type: int) + 1 key (type: int) + outputColumnNames: _col0, _col1, _col6 + input vertices: + 0 Map 1 + Position of Big Table: 1 + Statistics: Num rows: 31 Data size: 3368 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: int), _col1 (type: string), _col6 (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 31 Data size: 3368 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + GlobalTableId: 1 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + Statistics: Num rows: 31 Data size: 3368 Basic stats: COMPLETE Column stats: NONE +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + COLUMN_STATS_ACCURATE true + bucket_count -1 + columns key,value1,value2 + columns.comments + columns.types string:string:string +#### A masked pattern was here #### + name default.bucketmapjoin_tmp_result + numFiles 1 + numRows 928 + rawDataSize 17038 + serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 17966 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucketmapjoin_tmp_result + TotalFiles: 1 + GatherStats: true + MultiFileSpray: false + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: @@ -849,52 +901,6 @@ STAGE PLANS: Truncated Path -> Alias: /srcbucket_mapjoin_part_2/ds=2008-04-08 [b] /srcbucket_mapjoin_part_2/ds=2008-04-09 [b] - Reducer 2 - Needs Tagging: true - Reduce Operator Tree: - Join Operator - condition map: - Inner Join 0 to 1 - condition expressions: - 0 {KEY.reducesinkkey0} {VALUE._col0} - 1 {VALUE._col0} - outputColumnNames: _col0, _col1, _col6 - Statistics: Num rows: 31 Data size: 3368 Basic stats: COMPLETE Column stats: NONE - Select Operator - expressions: _col0 (type: int), _col1 (type: string), _col6 (type: string) - outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 31 Data size: 3368 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - GlobalTableId: 1 -#### A masked pattern was here #### - NumFilesPerFileSink: 1 - Statistics: Num rows: 31 Data size: 3368 Basic stats: COMPLETE Column stats: NONE -#### A masked pattern was here #### - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - properties: - COLUMN_STATS_ACCURATE true - bucket_count -1 - columns key,value1,value2 - columns.comments - columns.types string:string:string -#### A masked pattern was here #### - name default.bucketmapjoin_tmp_result - numFiles 3 - numRows 928 - rawDataSize 17038 - serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2} - serialization.format 1 - serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - totalSize 17966 -#### A masked pattern was here #### - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.bucketmapjoin_tmp_result - TotalFiles: 1 - GatherStats: true - MultiFileSpray: false Stage: Stage-0 Move Operator @@ -912,7 +918,7 @@ STAGE PLANS: columns.types string:string:string #### A masked pattern was here #### name default.bucketmapjoin_tmp_result - numFiles 3 + numFiles 1 numRows 928 rawDataSize 17038 serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2} diff --git ql/src/test/results/clientpositive/spark/bucketmapjoin7.q.out ql/src/test/results/clientpositive/spark/bucketmapjoin7.q.out index 54c4be3..ca59d02 100644 --- ql/src/test/results/clientpositive/spark/bucketmapjoin7.q.out +++ ql/src/test/results/clientpositive/spark/bucketmapjoin7.q.out @@ -140,34 +140,35 @@ TOK_QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 Stage-0 depends on stages: Stage-1 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-2 Spark - Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 4 (PARTITION-LEVEL SORT, 1) - Reducer 3 <- Reducer 2 (SORT, 1) #### A masked pattern was here #### Vertices: - Map 1 + Map 3 Map Operator Tree: TableScan - alias: a - Statistics: Num rows: 687 Data size: 2750 Basic stats: COMPLETE Column stats: NONE + alias: b + Statistics: Num rows: 26 Data size: 2750 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: key is not null (type: boolean) - Statistics: Num rows: 344 Data size: 1377 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: int) - sort order: + - Map-reduce partition columns: key (type: int) - Statistics: Num rows: 344 Data size: 1377 Basic stats: COMPLETE Column stats: NONE - tag: 0 - auto parallelism: false + Statistics: Num rows: 13 Data size: 1375 Basic stats: COMPLETE Column stats: NONE + Spark HashTable Sink Operator + condition expressions: + 0 {key} + 1 {value} + keys: + 0 key (type: int) + 1 key (type: int) + Position of Big Table: 0 + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: @@ -187,13 +188,13 @@ STAGE PLANS: columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_1 + name default.srcbucket_mapjoin_part_2 numFiles 2 numRows 0 partition_columns ds/hr partition_columns.types string:string rawDataSize 0 - serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 2750 @@ -209,36 +210,61 @@ STAGE PLANS: columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_1 + name default.srcbucket_mapjoin_part_2 partition_columns ds/hr partition_columns.types string:string - serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.srcbucket_mapjoin_part_1 - name: default.srcbucket_mapjoin_part_1 + name: default.srcbucket_mapjoin_part_2 + name: default.srcbucket_mapjoin_part_2 Truncated Path -> Alias: - /srcbucket_mapjoin_part_1/ds=2008-04-08/hr=0 [a] - Map 4 + /srcbucket_mapjoin_part_2/ds=2008-04-08/hr=0 [b] + + Stage: Stage-1 + Spark + Edges: + Reducer 2 <- Map 1 (SORT, 1) +#### A masked pattern was here #### + Vertices: + Map 1 Map Operator Tree: TableScan - alias: b - Statistics: Num rows: 26 Data size: 2750 Basic stats: COMPLETE Column stats: NONE + alias: a + Statistics: Num rows: 687 Data size: 2750 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: key is not null (type: boolean) - Statistics: Num rows: 13 Data size: 1375 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: int) - sort order: + - Map-reduce partition columns: key (type: int) - Statistics: Num rows: 13 Data size: 1375 Basic stats: COMPLETE Column stats: NONE - tag: 1 - value expressions: value (type: string) - auto parallelism: false + Statistics: Num rows: 344 Data size: 1377 Basic stats: COMPLETE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} + 1 {value} + keys: + 0 key (type: int) + 1 key (type: int) + outputColumnNames: _col0, _col8 + input vertices: + 1 Map 3 + Position of Big Table: 0 + Statistics: Num rows: 378 Data size: 1514 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: int), _col8 (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 378 Data size: 1514 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: int), _col1 (type: string) + sort order: ++ + Statistics: Num rows: 378 Data size: 1514 Basic stats: COMPLETE Column stats: NONE + tag: -1 + auto parallelism: false + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: @@ -258,13 +284,13 @@ STAGE PLANS: columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_2 + name default.srcbucket_mapjoin_part_1 numFiles 2 numRows 0 partition_columns ds/hr partition_columns.types string:string rawDataSize 0 - serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 2750 @@ -280,40 +306,19 @@ STAGE PLANS: columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_2 + name default.srcbucket_mapjoin_part_1 partition_columns ds/hr partition_columns.types string:string - serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.srcbucket_mapjoin_part_2 - name: default.srcbucket_mapjoin_part_2 + name: default.srcbucket_mapjoin_part_1 + name: default.srcbucket_mapjoin_part_1 Truncated Path -> Alias: - /srcbucket_mapjoin_part_2/ds=2008-04-08/hr=0 [b] + /srcbucket_mapjoin_part_1/ds=2008-04-08/hr=0 [a] Reducer 2 - Needs Tagging: true - Reduce Operator Tree: - Join Operator - condition map: - Inner Join 0 to 1 - condition expressions: - 0 {KEY.reducesinkkey0} - 1 {VALUE._col0} - outputColumnNames: _col0, _col8 - Statistics: Num rows: 378 Data size: 1514 Basic stats: COMPLETE Column stats: NONE - Select Operator - expressions: _col0 (type: int), _col8 (type: string) - outputColumnNames: _col0, _col1 - Statistics: Num rows: 378 Data size: 1514 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: _col0 (type: int), _col1 (type: string) - sort order: ++ - Statistics: Num rows: 378 Data size: 1514 Basic stats: COMPLETE Column stats: NONE - tag: -1 - auto parallelism: false - Reducer 3 Needs Tagging: false Reduce Operator Tree: Select Operator diff --git ql/src/test/results/clientpositive/spark/bucketmapjoin8.q.out ql/src/test/results/clientpositive/spark/bucketmapjoin8.q.out index da9fe1c..f419eaf 100644 --- ql/src/test/results/clientpositive/spark/bucketmapjoin8.q.out +++ ql/src/test/results/clientpositive/spark/bucketmapjoin8.q.out @@ -126,34 +126,35 @@ TOK_QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 Stage-0 depends on stages: Stage-1 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-2 Spark - Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 4 (PARTITION-LEVEL SORT, 1) - Reducer 3 <- Reducer 2 (GROUP, 1) #### A masked pattern was here #### Vertices: - Map 1 + Map 3 Map Operator Tree: TableScan - alias: a + alias: b Statistics: Num rows: 687 Data size: 2750 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: key is not null (type: boolean) Statistics: Num rows: 344 Data size: 1377 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: int) - sort order: + - Map-reduce partition columns: key (type: int) - Statistics: Num rows: 344 Data size: 1377 Basic stats: COMPLETE Column stats: NONE - tag: 0 - auto parallelism: false + Spark HashTable Sink Operator + condition expressions: + 0 + 1 + keys: + 0 key (type: int) + 1 key (type: int) + Position of Big Table: 0 + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: @@ -172,13 +173,13 @@ STAGE PLANS: columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_1 + name default.srcbucket_mapjoin_part_2 numFiles 2 numRows 0 partition_columns part partition_columns.types string rawDataSize 0 - serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 2750 @@ -188,41 +189,67 @@ STAGE PLANS: input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: - bucket_count 2 + bucket_count 3 bucket_field_name key columns key,value columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_1 + name default.srcbucket_mapjoin_part_2 partition_columns part partition_columns.types string - serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.srcbucket_mapjoin_part_1 - name: default.srcbucket_mapjoin_part_1 + name: default.srcbucket_mapjoin_part_2 + name: default.srcbucket_mapjoin_part_2 Truncated Path -> Alias: - /srcbucket_mapjoin_part_1/part=1 [a] - Map 4 + /srcbucket_mapjoin_part_2/part=1 [b] + + Stage: Stage-1 + Spark + Edges: + Reducer 2 <- Map 1 (GROUP, 1) +#### A masked pattern was here #### + Vertices: + Map 1 Map Operator Tree: TableScan - alias: b + alias: a Statistics: Num rows: 687 Data size: 2750 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: key is not null (type: boolean) Statistics: Num rows: 344 Data size: 1377 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: int) - sort order: + - Map-reduce partition columns: key (type: int) - Statistics: Num rows: 344 Data size: 1377 Basic stats: COMPLETE Column stats: NONE - tag: 1 - auto parallelism: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + keys: + 0 key (type: int) + 1 key (type: int) + input vertices: + 1 Map 3 + Position of Big Table: 0 + Statistics: Num rows: 378 Data size: 1514 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count() + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + tag: -1 + value expressions: _col0 (type: bigint) + auto parallelism: false + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: @@ -241,13 +268,13 @@ STAGE PLANS: columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_2 + name default.srcbucket_mapjoin_part_1 numFiles 2 numRows 0 partition_columns part partition_columns.types string rawDataSize 0 - serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 2750 @@ -257,46 +284,25 @@ STAGE PLANS: input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: - bucket_count 3 + bucket_count 2 bucket_field_name key columns key,value columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_2 + name default.srcbucket_mapjoin_part_1 partition_columns part partition_columns.types string - serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.srcbucket_mapjoin_part_2 - name: default.srcbucket_mapjoin_part_2 + name: default.srcbucket_mapjoin_part_1 + name: default.srcbucket_mapjoin_part_1 Truncated Path -> Alias: - /srcbucket_mapjoin_part_2/part=1 [b] + /srcbucket_mapjoin_part_1/part=1 [a] Reducer 2 - Needs Tagging: true - Reduce Operator Tree: - Join Operator - condition map: - Inner Join 0 to 1 - condition expressions: - 0 - 1 - Statistics: Num rows: 378 Data size: 1514 Basic stats: COMPLETE Column stats: NONE - Group By Operator - aggregations: count() - mode: hash - outputColumnNames: _col0 - Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - sort order: - Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE - tag: -1 - value expressions: _col0 (type: bigint) - auto parallelism: false - Reducer 3 Needs Tagging: false Reduce Operator Tree: Group By Operator @@ -429,34 +435,35 @@ TOK_QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 Stage-0 depends on stages: Stage-1 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-2 Spark - Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 4 (PARTITION-LEVEL SORT, 3) - Reducer 3 <- Reducer 2 (GROUP, 1) #### A masked pattern was here #### Vertices: - Map 1 + Map 3 Map Operator Tree: TableScan - alias: a + alias: b Statistics: Num rows: 687 Data size: 2750 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: key is not null (type: boolean) Statistics: Num rows: 344 Data size: 1377 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: int) - sort order: + - Map-reduce partition columns: key (type: int) - Statistics: Num rows: 344 Data size: 1377 Basic stats: COMPLETE Column stats: NONE - tag: 0 - auto parallelism: false + Spark HashTable Sink Operator + condition expressions: + 0 + 1 + keys: + 0 key (type: int) + 1 key (type: int) + Position of Big Table: 0 + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: @@ -475,13 +482,13 @@ STAGE PLANS: columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_1 + name default.srcbucket_mapjoin_part_2 numFiles 2 numRows 0 partition_columns part partition_columns.types string rawDataSize 0 - serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 2750 @@ -492,40 +499,66 @@ STAGE PLANS: output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: bucket_count 2 - bucket_field_name key + bucket_field_name value columns key,value columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_1 + name default.srcbucket_mapjoin_part_2 partition_columns part partition_columns.types string - serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.srcbucket_mapjoin_part_1 - name: default.srcbucket_mapjoin_part_1 + name: default.srcbucket_mapjoin_part_2 + name: default.srcbucket_mapjoin_part_2 Truncated Path -> Alias: - /srcbucket_mapjoin_part_1/part=1 [a] - Map 4 + /srcbucket_mapjoin_part_2/part=1 [b] + + Stage: Stage-1 + Spark + Edges: + Reducer 2 <- Map 1 (GROUP, 1) +#### A masked pattern was here #### + Vertices: + Map 1 Map Operator Tree: TableScan - alias: b + alias: a Statistics: Num rows: 687 Data size: 2750 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: key is not null (type: boolean) Statistics: Num rows: 344 Data size: 1377 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: int) - sort order: + - Map-reduce partition columns: key (type: int) - Statistics: Num rows: 344 Data size: 1377 Basic stats: COMPLETE Column stats: NONE - tag: 1 - auto parallelism: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + keys: + 0 key (type: int) + 1 key (type: int) + input vertices: + 1 Map 3 + Position of Big Table: 0 + Statistics: Num rows: 378 Data size: 1514 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count() + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + tag: -1 + value expressions: _col0 (type: bigint) + auto parallelism: false + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: @@ -544,13 +577,13 @@ STAGE PLANS: columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_2 + name default.srcbucket_mapjoin_part_1 numFiles 2 numRows 0 partition_columns part partition_columns.types string rawDataSize 0 - serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 2750 @@ -561,45 +594,24 @@ STAGE PLANS: output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: bucket_count 2 - bucket_field_name value + bucket_field_name key columns key,value columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_2 + name default.srcbucket_mapjoin_part_1 partition_columns part partition_columns.types string - serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.srcbucket_mapjoin_part_2 - name: default.srcbucket_mapjoin_part_2 + name: default.srcbucket_mapjoin_part_1 + name: default.srcbucket_mapjoin_part_1 Truncated Path -> Alias: - /srcbucket_mapjoin_part_2/part=1 [b] + /srcbucket_mapjoin_part_1/part=1 [a] Reducer 2 - Needs Tagging: true - Reduce Operator Tree: - Join Operator - condition map: - Inner Join 0 to 1 - condition expressions: - 0 - 1 - Statistics: Num rows: 378 Data size: 1514 Basic stats: COMPLETE Column stats: NONE - Group By Operator - aggregations: count() - mode: hash - outputColumnNames: _col0 - Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - sort order: - Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE - tag: -1 - value expressions: _col0 (type: bigint) - auto parallelism: false - Reducer 3 Needs Tagging: false Reduce Operator Tree: Group By Operator diff --git ql/src/test/results/clientpositive/spark/bucketmapjoin9.q.out ql/src/test/results/clientpositive/spark/bucketmapjoin9.q.out index 5a5e3f6..f439a14 100644 --- ql/src/test/results/clientpositive/spark/bucketmapjoin9.q.out +++ ql/src/test/results/clientpositive/spark/bucketmapjoin9.q.out @@ -134,34 +134,35 @@ TOK_QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 Stage-0 depends on stages: Stage-1 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-2 Spark - Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 4 (PARTITION-LEVEL SORT, 1) - Reducer 3 <- Reducer 2 (GROUP, 1) #### A masked pattern was here #### Vertices: - Map 1 + Map 3 Map Operator Tree: TableScan - alias: a - Statistics: Num rows: 687 Data size: 2750 Basic stats: COMPLETE Column stats: NONE + alias: b + Statistics: Num rows: 1050 Data size: 4200 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: key is not null (type: boolean) - Statistics: Num rows: 344 Data size: 1377 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: int) - sort order: + - Map-reduce partition columns: key (type: int) - Statistics: Num rows: 344 Data size: 1377 Basic stats: COMPLETE Column stats: NONE - tag: 0 - auto parallelism: false + Statistics: Num rows: 525 Data size: 2100 Basic stats: COMPLETE Column stats: NONE + Spark HashTable Sink Operator + condition expressions: + 0 + 1 + keys: + 0 key (type: int) + 1 key (type: int) + Position of Big Table: 0 + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: @@ -174,22 +175,22 @@ STAGE PLANS: part 1 properties: COLUMN_STATS_ACCURATE true - bucket_count 2 + bucket_count 3 bucket_field_name key columns key,value columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_1 - numFiles 2 + name default.srcbucket_mapjoin_part_2 + numFiles 3 numRows 0 partition_columns part partition_columns.types string rawDataSize 0 - serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - totalSize 2750 + totalSize 4200 #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe @@ -202,35 +203,61 @@ STAGE PLANS: columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_1 + name default.srcbucket_mapjoin_part_2 partition_columns part partition_columns.types string - serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.srcbucket_mapjoin_part_1 - name: default.srcbucket_mapjoin_part_1 + name: default.srcbucket_mapjoin_part_2 + name: default.srcbucket_mapjoin_part_2 Truncated Path -> Alias: - /srcbucket_mapjoin_part_1/part=1 [a] - Map 4 + /srcbucket_mapjoin_part_2/part=1 [b] + + Stage: Stage-1 + Spark + Edges: + Reducer 2 <- Map 1 (GROUP, 1) +#### A masked pattern was here #### + Vertices: + Map 1 Map Operator Tree: TableScan - alias: b - Statistics: Num rows: 1050 Data size: 4200 Basic stats: COMPLETE Column stats: NONE + alias: a + Statistics: Num rows: 687 Data size: 2750 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: key is not null (type: boolean) - Statistics: Num rows: 525 Data size: 2100 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: int) - sort order: + - Map-reduce partition columns: key (type: int) - Statistics: Num rows: 525 Data size: 2100 Basic stats: COMPLETE Column stats: NONE - tag: 1 - auto parallelism: false + Statistics: Num rows: 344 Data size: 1377 Basic stats: COMPLETE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + keys: + 0 key (type: int) + 1 key (type: int) + input vertices: + 1 Map 3 + Position of Big Table: 0 + Statistics: Num rows: 577 Data size: 2310 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count() + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + tag: -1 + value expressions: _col0 (type: bigint) + auto parallelism: false + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: @@ -243,22 +270,22 @@ STAGE PLANS: part 1 properties: COLUMN_STATS_ACCURATE true - bucket_count 3 + bucket_count 2 bucket_field_name key columns key,value columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_2 - numFiles 3 + name default.srcbucket_mapjoin_part_1 + numFiles 2 numRows 0 partition_columns part partition_columns.types string rawDataSize 0 - serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - totalSize 4200 + totalSize 2750 #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe @@ -271,40 +298,19 @@ STAGE PLANS: columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_2 + name default.srcbucket_mapjoin_part_1 partition_columns part partition_columns.types string - serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.srcbucket_mapjoin_part_2 - name: default.srcbucket_mapjoin_part_2 + name: default.srcbucket_mapjoin_part_1 + name: default.srcbucket_mapjoin_part_1 Truncated Path -> Alias: - /srcbucket_mapjoin_part_2/part=1 [b] + /srcbucket_mapjoin_part_1/part=1 [a] Reducer 2 - Needs Tagging: true - Reduce Operator Tree: - Join Operator - condition map: - Inner Join 0 to 1 - condition expressions: - 0 - 1 - Statistics: Num rows: 577 Data size: 2310 Basic stats: COMPLETE Column stats: NONE - Group By Operator - aggregations: count() - mode: hash - outputColumnNames: _col0 - Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - sort order: - Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE - tag: -1 - value expressions: _col0 (type: bigint) - auto parallelism: false - Reducer 3 Needs Tagging: false Reduce Operator Tree: Group By Operator @@ -470,34 +476,35 @@ TOK_QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 Stage-0 depends on stages: Stage-1 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-2 Spark - Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 4 (PARTITION-LEVEL SORT, 3) - Reducer 3 <- Reducer 2 (GROUP, 1) #### A masked pattern was here #### Vertices: - Map 1 + Map 3 Map Operator Tree: TableScan - alias: a + alias: b Statistics: Num rows: 687 Data size: 2750 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: key is not null (type: boolean) Statistics: Num rows: 344 Data size: 1377 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: int) - sort order: + - Map-reduce partition columns: key (type: int) - Statistics: Num rows: 344 Data size: 1377 Basic stats: COMPLETE Column stats: NONE - tag: 0 - auto parallelism: false + Spark HashTable Sink Operator + condition expressions: + 0 + 1 + keys: + 0 key (type: int) + 1 key (type: int) + Position of Big Table: 0 + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: @@ -511,18 +518,18 @@ STAGE PLANS: properties: COLUMN_STATS_ACCURATE true bucket_count 2 - bucket_field_name key + bucket_field_name value columns key,value columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_1 + name default.srcbucket_mapjoin_part_2 numFiles 2 numRows 0 partition_columns part partition_columns.types string rawDataSize 0 - serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 2750 @@ -538,35 +545,61 @@ STAGE PLANS: columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_1 + name default.srcbucket_mapjoin_part_2 partition_columns part partition_columns.types string - serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.srcbucket_mapjoin_part_1 - name: default.srcbucket_mapjoin_part_1 + name: default.srcbucket_mapjoin_part_2 + name: default.srcbucket_mapjoin_part_2 Truncated Path -> Alias: - /srcbucket_mapjoin_part_1/part=1 [a] - Map 4 + /srcbucket_mapjoin_part_2/part=1 [b] + + Stage: Stage-1 + Spark + Edges: + Reducer 2 <- Map 1 (GROUP, 1) +#### A masked pattern was here #### + Vertices: + Map 1 Map Operator Tree: TableScan - alias: b + alias: a Statistics: Num rows: 687 Data size: 2750 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: key is not null (type: boolean) Statistics: Num rows: 344 Data size: 1377 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: int) - sort order: + - Map-reduce partition columns: key (type: int) - Statistics: Num rows: 344 Data size: 1377 Basic stats: COMPLETE Column stats: NONE - tag: 1 - auto parallelism: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + keys: + 0 key (type: int) + 1 key (type: int) + input vertices: + 1 Map 3 + Position of Big Table: 0 + Statistics: Num rows: 378 Data size: 1514 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count() + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + tag: -1 + value expressions: _col0 (type: bigint) + auto parallelism: false + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: @@ -580,18 +613,18 @@ STAGE PLANS: properties: COLUMN_STATS_ACCURATE true bucket_count 2 - bucket_field_name value + bucket_field_name key columns key,value columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_2 + name default.srcbucket_mapjoin_part_1 numFiles 2 numRows 0 partition_columns part partition_columns.types string rawDataSize 0 - serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 2750 @@ -607,40 +640,19 @@ STAGE PLANS: columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_2 + name default.srcbucket_mapjoin_part_1 partition_columns part partition_columns.types string - serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.srcbucket_mapjoin_part_2 - name: default.srcbucket_mapjoin_part_2 + name: default.srcbucket_mapjoin_part_1 + name: default.srcbucket_mapjoin_part_1 Truncated Path -> Alias: - /srcbucket_mapjoin_part_2/part=1 [b] + /srcbucket_mapjoin_part_1/part=1 [a] Reducer 2 - Needs Tagging: true - Reduce Operator Tree: - Join Operator - condition map: - Inner Join 0 to 1 - condition expressions: - 0 - 1 - Statistics: Num rows: 378 Data size: 1514 Basic stats: COMPLETE Column stats: NONE - Group By Operator - aggregations: count() - mode: hash - outputColumnNames: _col0 - Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - sort order: - Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE - tag: -1 - value expressions: _col0 (type: bigint) - auto parallelism: false - Reducer 3 Needs Tagging: false Reduce Operator Tree: Group By Operator diff --git ql/src/test/results/clientpositive/spark/bucketmapjoin_negative.q.out ql/src/test/results/clientpositive/spark/bucketmapjoin_negative.q.out index 5ac3f4c..a2bf84c 100644 --- ql/src/test/results/clientpositive/spark/bucketmapjoin_negative.q.out +++ ql/src/test/results/clientpositive/spark/bucketmapjoin_negative.q.out @@ -133,193 +133,196 @@ TOK_QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-3 is a root stage + Stage-1 depends on stages: Stage-3 Stage-0 depends on stages: Stage-1 Stage-2 depends on stages: Stage-0 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-3 Spark - Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 3 (PARTITION-LEVEL SORT, 1) #### A masked pattern was here #### Vertices: - Map 1 + Map 2 Map Operator Tree: TableScan - alias: a - Statistics: Num rows: 26 Data size: 2750 Basic stats: COMPLETE Column stats: NONE + alias: b + Statistics: Num rows: 40 Data size: 4200 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: key is not null (type: boolean) - Statistics: Num rows: 13 Data size: 1375 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: int) - sort order: + - Map-reduce partition columns: key (type: int) - Statistics: Num rows: 13 Data size: 1375 Basic stats: COMPLETE Column stats: NONE - tag: 0 - value expressions: value (type: string) - auto parallelism: false + Statistics: Num rows: 20 Data size: 2100 Basic stats: COMPLETE Column stats: NONE + Spark HashTable Sink Operator + condition expressions: + 0 {key} {value} + 1 {value} + keys: + 0 key (type: int) + 1 key (type: int) + Position of Big Table: 0 + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: #### A masked pattern was here #### Partition - base file name: srcbucket_mapjoin + base file name: ds=2008-04-08 input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 properties: COLUMN_STATS_ACCURATE true - bucket_count 2 + bucket_count 3 bucket_field_name key columns key,value columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin - numFiles 2 - serialization.ddl struct srcbucket_mapjoin { i32 key, string value} + name default.srcbucket_mapjoin_part + numFiles 3 + numRows 0 + partition_columns ds + partition_columns.types string + rawDataSize 0 + serialization.ddl struct srcbucket_mapjoin_part { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - totalSize 2750 + totalSize 4200 #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: - COLUMN_STATS_ACCURATE true - bucket_count 2 + bucket_count 3 bucket_field_name key columns key,value columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin - numFiles 2 - serialization.ddl struct srcbucket_mapjoin { i32 key, string value} + name default.srcbucket_mapjoin_part + partition_columns ds + partition_columns.types string + serialization.ddl struct srcbucket_mapjoin_part { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - totalSize 2750 #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.srcbucket_mapjoin - name: default.srcbucket_mapjoin + name: default.srcbucket_mapjoin_part + name: default.srcbucket_mapjoin_part Truncated Path -> Alias: - /srcbucket_mapjoin [a] - Map 3 + /srcbucket_mapjoin_part/ds=2008-04-08 [b] + + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: + Map 1 Map Operator Tree: TableScan - alias: b - Statistics: Num rows: 40 Data size: 4200 Basic stats: COMPLETE Column stats: NONE + alias: a + Statistics: Num rows: 26 Data size: 2750 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: key is not null (type: boolean) - Statistics: Num rows: 20 Data size: 2100 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: int) - sort order: + - Map-reduce partition columns: key (type: int) - Statistics: Num rows: 20 Data size: 2100 Basic stats: COMPLETE Column stats: NONE - tag: 1 - value expressions: value (type: string) - auto parallelism: false + Statistics: Num rows: 13 Data size: 1375 Basic stats: COMPLETE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {value} + 1 {value} + keys: + 0 key (type: int) + 1 key (type: int) + outputColumnNames: _col0, _col1, _col6 + input vertices: + 1 Map 2 + Position of Big Table: 0 + Statistics: Num rows: 22 Data size: 2310 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: int), _col1 (type: string), _col6 (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 22 Data size: 2310 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + GlobalTableId: 1 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + Statistics: Num rows: 22 Data size: 2310 Basic stats: COMPLETE Column stats: NONE +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns key,value1,value2 + columns.comments + columns.types string:string:string +#### A masked pattern was here #### + name default.bucketmapjoin_tmp_result + serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucketmapjoin_tmp_result + TotalFiles: 1 + GatherStats: true + MultiFileSpray: false + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: #### A masked pattern was here #### Partition - base file name: ds=2008-04-08 + base file name: srcbucket_mapjoin input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - partition values: - ds 2008-04-08 properties: COLUMN_STATS_ACCURATE true - bucket_count 3 + bucket_count 2 bucket_field_name key columns key,value columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part - numFiles 3 - numRows 0 - partition_columns ds - partition_columns.types string - rawDataSize 0 - serialization.ddl struct srcbucket_mapjoin_part { i32 key, string value} + name default.srcbucket_mapjoin + numFiles 2 + serialization.ddl struct srcbucket_mapjoin { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - totalSize 4200 + totalSize 2750 #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: - bucket_count 3 + COLUMN_STATS_ACCURATE true + bucket_count 2 bucket_field_name key columns key,value columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part - partition_columns ds - partition_columns.types string - serialization.ddl struct srcbucket_mapjoin_part { i32 key, string value} + name default.srcbucket_mapjoin + numFiles 2 + serialization.ddl struct srcbucket_mapjoin { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.srcbucket_mapjoin_part - name: default.srcbucket_mapjoin_part + name: default.srcbucket_mapjoin + name: default.srcbucket_mapjoin Truncated Path -> Alias: - /srcbucket_mapjoin_part/ds=2008-04-08 [b] - Reducer 2 - Needs Tagging: true - Reduce Operator Tree: - Join Operator - condition map: - Inner Join 0 to 1 - condition expressions: - 0 {KEY.reducesinkkey0} {VALUE._col0} - 1 {VALUE._col0} - outputColumnNames: _col0, _col1, _col6 - Statistics: Num rows: 22 Data size: 2310 Basic stats: COMPLETE Column stats: NONE - Select Operator - expressions: _col0 (type: int), _col1 (type: string), _col6 (type: string) - outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 22 Data size: 2310 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - GlobalTableId: 1 -#### A masked pattern was here #### - NumFilesPerFileSink: 1 - Statistics: Num rows: 22 Data size: 2310 Basic stats: COMPLETE Column stats: NONE -#### A masked pattern was here #### - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - properties: - bucket_count -1 - columns key,value1,value2 - columns.comments - columns.types string:string:string -#### A masked pattern was here #### - name default.bucketmapjoin_tmp_result - serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2} - serialization.format 1 - serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe -#### A masked pattern was here #### - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.bucketmapjoin_tmp_result - TotalFiles: 1 - GatherStats: true - MultiFileSpray: false + /srcbucket_mapjoin [a] Stage: Stage-0 Move Operator diff --git ql/src/test/results/clientpositive/spark/bucketmapjoin_negative2.q.out ql/src/test/results/clientpositive/spark/bucketmapjoin_negative2.q.out index e4ff965..b954feb 100644 --- ql/src/test/results/clientpositive/spark/bucketmapjoin_negative2.q.out +++ ql/src/test/results/clientpositive/spark/bucketmapjoin_negative2.q.out @@ -135,43 +135,46 @@ TOK_QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-3 is a root stage + Stage-1 depends on stages: Stage-3 Stage-0 depends on stages: Stage-1 Stage-2 depends on stages: Stage-0 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-3 Spark - Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 3 (PARTITION-LEVEL SORT, 1) #### A masked pattern was here #### Vertices: - Map 1 + Map 2 Map Operator Tree: TableScan - alias: a - Statistics: Num rows: 26 Data size: 2750 Basic stats: COMPLETE Column stats: NONE + alias: b + Statistics: Num rows: 58 Data size: 6124 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: key is not null (type: boolean) - Statistics: Num rows: 13 Data size: 1375 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: int) - sort order: + - Map-reduce partition columns: key (type: int) - Statistics: Num rows: 13 Data size: 1375 Basic stats: COMPLETE Column stats: NONE - tag: 0 - value expressions: value (type: string) - auto parallelism: false + Statistics: Num rows: 29 Data size: 3062 Basic stats: COMPLETE Column stats: NONE + Spark HashTable Sink Operator + condition expressions: + 0 {key} {value} + 1 {value} + keys: + 0 key (type: int) + 1 key (type: int) + Position of Big Table: 0 + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: #### A masked pattern was here #### Partition - base file name: srcbucket_mapjoin + base file name: ds=2008-04-08 input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 properties: COLUMN_STATS_ACCURATE true bucket_count 2 @@ -180,65 +183,45 @@ STAGE PLANS: columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin + name default.srcbucket_mapjoin_part_2 numFiles 2 - serialization.ddl struct srcbucket_mapjoin { i32 key, string value} + numRows 0 + partition_columns ds + partition_columns.types string + rawDataSize 0 + serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - totalSize 2750 + totalSize 3062 #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: - COLUMN_STATS_ACCURATE true bucket_count 2 bucket_field_name key columns key,value columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin - numFiles 2 - serialization.ddl struct srcbucket_mapjoin { i32 key, string value} + name default.srcbucket_mapjoin_part_2 + partition_columns ds + partition_columns.types string + serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - totalSize 2750 #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.srcbucket_mapjoin - name: default.srcbucket_mapjoin - Truncated Path -> Alias: - /srcbucket_mapjoin [a] - Map 3 - Map Operator Tree: - TableScan - alias: b - Statistics: Num rows: 58 Data size: 6124 Basic stats: COMPLETE Column stats: NONE - GatherStats: false - Filter Operator - isSamplingPred: false - predicate: key is not null (type: boolean) - Statistics: Num rows: 29 Data size: 3062 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: int) - sort order: + - Map-reduce partition columns: key (type: int) - Statistics: Num rows: 29 Data size: 3062 Basic stats: COMPLETE Column stats: NONE - tag: 1 - value expressions: value (type: string) - auto parallelism: false - Path -> Alias: -#### A masked pattern was here #### - Path -> Partition: + name: default.srcbucket_mapjoin_part_2 + name: default.srcbucket_mapjoin_part_2 #### A masked pattern was here #### Partition - base file name: ds=2008-04-08 + base file name: ds=2008-04-09 input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat partition values: - ds 2008-04-08 + ds 2008-04-09 properties: COLUMN_STATS_ACCURATE true bucket_count 2 @@ -279,13 +262,78 @@ STAGE PLANS: serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.srcbucket_mapjoin_part_2 name: default.srcbucket_mapjoin_part_2 + Truncated Path -> Alias: + /srcbucket_mapjoin_part_2/ds=2008-04-08 [b] + /srcbucket_mapjoin_part_2/ds=2008-04-09 [b] + + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: a + Statistics: Num rows: 26 Data size: 2750 Basic stats: COMPLETE Column stats: NONE + GatherStats: false + Filter Operator + isSamplingPred: false + predicate: key is not null (type: boolean) + Statistics: Num rows: 13 Data size: 1375 Basic stats: COMPLETE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {value} + 1 {value} + keys: + 0 key (type: int) + 1 key (type: int) + outputColumnNames: _col0, _col1, _col6 + input vertices: + 1 Map 2 + Position of Big Table: 0 + Statistics: Num rows: 31 Data size: 3368 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: int), _col1 (type: string), _col6 (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 31 Data size: 3368 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + GlobalTableId: 1 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + Statistics: Num rows: 31 Data size: 3368 Basic stats: COMPLETE Column stats: NONE +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns key,value1,value2 + columns.comments + columns.types string:string:string +#### A masked pattern was here #### + name default.bucketmapjoin_tmp_result + serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.bucketmapjoin_tmp_result + TotalFiles: 1 + GatherStats: true + MultiFileSpray: false + Local Work: + Map Reduce Local Work + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: #### A masked pattern was here #### Partition - base file name: ds=2008-04-09 + base file name: srcbucket_mapjoin input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - partition values: - ds 2008-04-09 properties: COLUMN_STATS_ACCURATE true bucket_count 2 @@ -294,82 +342,37 @@ STAGE PLANS: columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_2 + name default.srcbucket_mapjoin numFiles 2 - numRows 0 - partition_columns ds - partition_columns.types string - rawDataSize 0 - serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} + serialization.ddl struct srcbucket_mapjoin { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - totalSize 3062 + totalSize 2750 #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: + COLUMN_STATS_ACCURATE true bucket_count 2 bucket_field_name key columns key,value columns.comments columns.types int:string #### A masked pattern was here #### - name default.srcbucket_mapjoin_part_2 - partition_columns ds - partition_columns.types string - serialization.ddl struct srcbucket_mapjoin_part_2 { i32 key, string value} + name default.srcbucket_mapjoin + numFiles 2 + serialization.ddl struct srcbucket_mapjoin { i32 key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2750 #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.srcbucket_mapjoin_part_2 - name: default.srcbucket_mapjoin_part_2 + name: default.srcbucket_mapjoin + name: default.srcbucket_mapjoin Truncated Path -> Alias: - /srcbucket_mapjoin_part_2/ds=2008-04-08 [b] - /srcbucket_mapjoin_part_2/ds=2008-04-09 [b] - Reducer 2 - Needs Tagging: true - Reduce Operator Tree: - Join Operator - condition map: - Inner Join 0 to 1 - condition expressions: - 0 {KEY.reducesinkkey0} {VALUE._col0} - 1 {VALUE._col0} - outputColumnNames: _col0, _col1, _col6 - Statistics: Num rows: 31 Data size: 3368 Basic stats: COMPLETE Column stats: NONE - Select Operator - expressions: _col0 (type: int), _col1 (type: string), _col6 (type: string) - outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 31 Data size: 3368 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - GlobalTableId: 1 -#### A masked pattern was here #### - NumFilesPerFileSink: 1 - Statistics: Num rows: 31 Data size: 3368 Basic stats: COMPLETE Column stats: NONE -#### A masked pattern was here #### - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - properties: - bucket_count -1 - columns key,value1,value2 - columns.comments - columns.types string:string:string -#### A masked pattern was here #### - name default.bucketmapjoin_tmp_result - serialization.ddl struct bucketmapjoin_tmp_result { string key, string value1, string value2} - serialization.format 1 - serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe -#### A masked pattern was here #### - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.bucketmapjoin_tmp_result - TotalFiles: 1 - GatherStats: true - MultiFileSpray: false + /srcbucket_mapjoin [a] Stage: Stage-0 Move Operator diff --git ql/src/test/results/clientpositive/spark/bucketmapjoin_negative3.q.out ql/src/test/results/clientpositive/spark/bucketmapjoin_negative3.q.out index fce5566..bfe5438 100644 --- ql/src/test/results/clientpositive/spark/bucketmapjoin_negative3.q.out +++ ql/src/test/results/clientpositive/spark/bucketmapjoin_negative3.q.out @@ -195,33 +195,35 @@ TOK_QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 Stage-0 depends on stages: Stage-1 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-2 Spark - Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 3 (PARTITION-LEVEL SORT, 1) #### A masked pattern was here #### Vertices: - Map 1 + Map 2 Map Operator Tree: TableScan - alias: l + alias: r Statistics: Num rows: 21 Data size: 4200 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: (key is not null and value is not null) (type: boolean) Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: string), value (type: string) - sort order: ++ - Map-reduce partition columns: key (type: string), value (type: string) - Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE - tag: 0 - auto parallelism: false + Spark HashTable Sink Operator + condition expressions: + 0 {key} {value} + 1 + keys: + 0 key (type: string), value (type: string) + 1 key (type: string), value (type: string) + Position of Big Table: 0 + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: @@ -270,24 +272,63 @@ STAGE PLANS: name: default.test1 name: default.test1 Truncated Path -> Alias: - /test1 [l] - Map 3 + /test1 [r] + + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: + Map 1 Map Operator Tree: TableScan - alias: r + alias: l Statistics: Num rows: 21 Data size: 4200 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: (key is not null and value is not null) (type: boolean) Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: string), value (type: string) - sort order: ++ - Map-reduce partition columns: key (type: string), value (type: string) - Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE - tag: 1 - auto parallelism: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {value} + 1 {key} {value} + keys: + 0 key (type: string), value (type: string) + 1 key (type: string), value (type: string) + outputColumnNames: _col0, _col1, _col5, _col6 + input vertices: + 1 Map 2 + Position of Big Table: 0 + Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2,_col3 + columns.types string:string:string:string + escape.delim \ + hive.serialization.extend.nesting.levels true + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: @@ -336,43 +377,7 @@ STAGE PLANS: name: default.test1 name: default.test1 Truncated Path -> Alias: - /test1 [r] - Reducer 2 - Needs Tagging: true - Reduce Operator Tree: - Join Operator - condition map: - Inner Join 0 to 1 - condition expressions: - 0 {KEY.reducesinkkey0} {KEY.reducesinkkey1} - 1 {KEY.reducesinkkey0} {KEY.reducesinkkey1} - outputColumnNames: _col0, _col1, _col5, _col6 - Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE - Select Operator - expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string) - outputColumnNames: _col0, _col1, _col2, _col3 - Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - GlobalTableId: 0 -#### A masked pattern was here #### - NumFilesPerFileSink: 1 - Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE -#### A masked pattern was here #### - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - properties: - columns _col0,_col1,_col2,_col3 - columns.types string:string:string:string - escape.delim \ - hive.serialization.extend.nesting.levels true - serialization.format 1 - serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - TotalFiles: 1 - GatherStats: false - MultiFileSpray: false + /test1 [l] Stage: Stage-0 Fetch Operator @@ -431,33 +436,35 @@ TOK_QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 Stage-0 depends on stages: Stage-1 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-2 Spark - Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 3 (PARTITION-LEVEL SORT, 1) #### A masked pattern was here #### Vertices: - Map 1 + Map 2 Map Operator Tree: TableScan - alias: l + alias: r Statistics: Num rows: 21 Data size: 4200 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: (key is not null and value is not null) (type: boolean) Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: string), value (type: string) - sort order: ++ - Map-reduce partition columns: key (type: string), value (type: string) - Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE - tag: 0 - auto parallelism: false + Spark HashTable Sink Operator + condition expressions: + 0 {key} {value} + 1 + keys: + 0 key (type: string), value (type: string) + 1 key (type: string), value (type: string) + Position of Big Table: 0 + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: @@ -506,24 +513,63 @@ STAGE PLANS: name: default.test2 name: default.test2 Truncated Path -> Alias: - /test2 [l] - Map 3 + /test2 [r] + + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: + Map 1 Map Operator Tree: TableScan - alias: r + alias: l Statistics: Num rows: 21 Data size: 4200 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: (key is not null and value is not null) (type: boolean) Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: string), value (type: string) - sort order: ++ - Map-reduce partition columns: key (type: string), value (type: string) - Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE - tag: 1 - auto parallelism: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {value} + 1 {key} {value} + keys: + 0 key (type: string), value (type: string) + 1 key (type: string), value (type: string) + outputColumnNames: _col0, _col1, _col5, _col6 + input vertices: + 1 Map 2 + Position of Big Table: 0 + Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2,_col3 + columns.types string:string:string:string + escape.delim \ + hive.serialization.extend.nesting.levels true + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: @@ -572,43 +618,7 @@ STAGE PLANS: name: default.test2 name: default.test2 Truncated Path -> Alias: - /test2 [r] - Reducer 2 - Needs Tagging: true - Reduce Operator Tree: - Join Operator - condition map: - Inner Join 0 to 1 - condition expressions: - 0 {KEY.reducesinkkey0} {KEY.reducesinkkey1} - 1 {KEY.reducesinkkey0} {KEY.reducesinkkey1} - outputColumnNames: _col0, _col1, _col5, _col6 - Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE - Select Operator - expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string) - outputColumnNames: _col0, _col1, _col2, _col3 - Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - GlobalTableId: 0 -#### A masked pattern was here #### - NumFilesPerFileSink: 1 - Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE -#### A masked pattern was here #### - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - properties: - columns _col0,_col1,_col2,_col3 - columns.types string:string:string:string - escape.delim \ - hive.serialization.extend.nesting.levels true - serialization.format 1 - serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - TotalFiles: 1 - GatherStats: false - MultiFileSpray: false + /test2 [l] Stage: Stage-0 Fetch Operator @@ -664,34 +674,35 @@ TOK_QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 Stage-0 depends on stages: Stage-1 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-2 Spark - Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 3 (PARTITION-LEVEL SORT, 1) #### A masked pattern was here #### Vertices: - Map 1 + Map 2 Map Operator Tree: TableScan - alias: l + alias: r Statistics: Num rows: 21 Data size: 4200 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false - predicate: (key + key) is not null (type: boolean) + predicate: UDFToDouble(key) is not null (type: boolean) Statistics: Num rows: 11 Data size: 2200 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: (key + key) (type: double) - sort order: + - Map-reduce partition columns: (key + key) (type: double) - Statistics: Num rows: 11 Data size: 2200 Basic stats: COMPLETE Column stats: NONE - tag: 0 - value expressions: key (type: string), value (type: string) - auto parallelism: false + Spark HashTable Sink Operator + condition expressions: + 0 {key} {value} + 1 {key} {value} + keys: + 0 (key + key) (type: double) + 1 UDFToDouble(key) (type: double) + Position of Big Table: 0 + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: @@ -740,25 +751,63 @@ STAGE PLANS: name: default.test1 name: default.test1 Truncated Path -> Alias: - /test1 [l] - Map 3 + /test1 [r] + + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: + Map 1 Map Operator Tree: TableScan - alias: r + alias: l Statistics: Num rows: 21 Data size: 4200 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false - predicate: UDFToDouble(key) is not null (type: boolean) + predicate: (key + key) is not null (type: boolean) Statistics: Num rows: 11 Data size: 2200 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: UDFToDouble(key) (type: double) - sort order: + - Map-reduce partition columns: UDFToDouble(key) (type: double) - Statistics: Num rows: 11 Data size: 2200 Basic stats: COMPLETE Column stats: NONE - tag: 1 - value expressions: key (type: string), value (type: string) - auto parallelism: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {value} + 1 {key} {value} + keys: + 0 (key + key) (type: double) + 1 UDFToDouble(key) (type: double) + outputColumnNames: _col0, _col1, _col5, _col6 + input vertices: + 1 Map 2 + Position of Big Table: 0 + Statistics: Num rows: 12 Data size: 2420 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 12 Data size: 2420 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + Statistics: Num rows: 12 Data size: 2420 Basic stats: COMPLETE Column stats: NONE +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2,_col3 + columns.types string:string:string:string + escape.delim \ + hive.serialization.extend.nesting.levels true + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: @@ -807,43 +856,7 @@ STAGE PLANS: name: default.test1 name: default.test1 Truncated Path -> Alias: - /test1 [r] - Reducer 2 - Needs Tagging: true - Reduce Operator Tree: - Join Operator - condition map: - Inner Join 0 to 1 - condition expressions: - 0 {VALUE._col0} {VALUE._col1} - 1 {VALUE._col0} {VALUE._col1} - outputColumnNames: _col0, _col1, _col5, _col6 - Statistics: Num rows: 12 Data size: 2420 Basic stats: COMPLETE Column stats: NONE - Select Operator - expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string) - outputColumnNames: _col0, _col1, _col2, _col3 - Statistics: Num rows: 12 Data size: 2420 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - GlobalTableId: 0 -#### A masked pattern was here #### - NumFilesPerFileSink: 1 - Statistics: Num rows: 12 Data size: 2420 Basic stats: COMPLETE Column stats: NONE -#### A masked pattern was here #### - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - properties: - columns _col0,_col1,_col2,_col3 - columns.types string:string:string:string - escape.delim \ - hive.serialization.extend.nesting.levels true - serialization.format 1 - serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - TotalFiles: 1 - GatherStats: false - MultiFileSpray: false + /test1 [l] Stage: Stage-0 Fetch Operator @@ -902,53 +915,55 @@ TOK_QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 Stage-0 depends on stages: Stage-1 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-2 Spark - Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 3 (PARTITION-LEVEL SORT, 1) #### A masked pattern was here #### Vertices: - Map 1 + Map 2 Map Operator Tree: TableScan - alias: l + alias: r Statistics: Num rows: 21 Data size: 4200 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: (key is not null and value is not null) (type: boolean) Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: string), value (type: string) - sort order: ++ - Map-reduce partition columns: key (type: string), value (type: string) - Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE - tag: 0 - auto parallelism: false + Spark HashTable Sink Operator + condition expressions: + 0 {key} {value} + 1 + keys: + 0 key (type: string), value (type: string) + 1 key (type: string), value (type: string) + Position of Big Table: 0 + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: #### A masked pattern was here #### Partition - base file name: test1 + base file name: test2 input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: COLUMN_STATS_ACCURATE true SORTBUCKETCOLSPREFIX TRUE bucket_count 3 - bucket_field_name key + bucket_field_name value columns key,value columns.comments columns.types string:string #### A masked pattern was here #### - name default.test1 + name default.test2 numFiles 3 - serialization.ddl struct test1 { string key, string value} + serialization.ddl struct test2 { string key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 4200 @@ -961,60 +976,99 @@ STAGE PLANS: COLUMN_STATS_ACCURATE true SORTBUCKETCOLSPREFIX TRUE bucket_count 3 - bucket_field_name key + bucket_field_name value columns key,value columns.comments columns.types string:string #### A masked pattern was here #### - name default.test1 + name default.test2 numFiles 3 - serialization.ddl struct test1 { string key, string value} + serialization.ddl struct test2 { string key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 4200 #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.test1 - name: default.test1 + name: default.test2 + name: default.test2 Truncated Path -> Alias: - /test1 [l] - Map 3 + /test2 [r] + + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: + Map 1 Map Operator Tree: TableScan - alias: r + alias: l Statistics: Num rows: 21 Data size: 4200 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: (key is not null and value is not null) (type: boolean) Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: string), value (type: string) - sort order: ++ - Map-reduce partition columns: key (type: string), value (type: string) - Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE - tag: 1 - auto parallelism: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {value} + 1 {key} {value} + keys: + 0 key (type: string), value (type: string) + 1 key (type: string), value (type: string) + outputColumnNames: _col0, _col1, _col5, _col6 + input vertices: + 1 Map 2 + Position of Big Table: 0 + Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2,_col3 + columns.types string:string:string:string + escape.delim \ + hive.serialization.extend.nesting.levels true + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: #### A masked pattern was here #### Partition - base file name: test2 + base file name: test1 input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: COLUMN_STATS_ACCURATE true SORTBUCKETCOLSPREFIX TRUE bucket_count 3 - bucket_field_name value + bucket_field_name key columns key,value columns.comments columns.types string:string #### A masked pattern was here #### - name default.test2 + name default.test1 numFiles 3 - serialization.ddl struct test2 { string key, string value} + serialization.ddl struct test1 { string key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 4200 @@ -1027,59 +1081,23 @@ STAGE PLANS: COLUMN_STATS_ACCURATE true SORTBUCKETCOLSPREFIX TRUE bucket_count 3 - bucket_field_name value + bucket_field_name key columns key,value columns.comments columns.types string:string #### A masked pattern was here #### - name default.test2 + name default.test1 numFiles 3 - serialization.ddl struct test2 { string key, string value} + serialization.ddl struct test1 { string key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 4200 #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.test2 - name: default.test2 + name: default.test1 + name: default.test1 Truncated Path -> Alias: - /test2 [r] - Reducer 2 - Needs Tagging: true - Reduce Operator Tree: - Join Operator - condition map: - Inner Join 0 to 1 - condition expressions: - 0 {KEY.reducesinkkey0} {KEY.reducesinkkey1} - 1 {KEY.reducesinkkey0} {KEY.reducesinkkey1} - outputColumnNames: _col0, _col1, _col5, _col6 - Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE - Select Operator - expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string) - outputColumnNames: _col0, _col1, _col2, _col3 - Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - GlobalTableId: 0 -#### A masked pattern was here #### - NumFilesPerFileSink: 1 - Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE -#### A masked pattern was here #### - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - properties: - columns _col0,_col1,_col2,_col3 - columns.types string:string:string:string - escape.delim \ - hive.serialization.extend.nesting.levels true - serialization.format 1 - serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - TotalFiles: 1 - GatherStats: false - MultiFileSpray: false + /test1 [l] Stage: Stage-0 Fetch Operator @@ -1138,39 +1156,41 @@ TOK_QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 Stage-0 depends on stages: Stage-1 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-2 Spark - Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 3 (PARTITION-LEVEL SORT, 1) #### A masked pattern was here #### Vertices: - Map 1 + Map 2 Map Operator Tree: TableScan - alias: l + alias: r Statistics: Num rows: 21 Data size: 4200 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: (key is not null and value is not null) (type: boolean) Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: string), value (type: string) - sort order: ++ - Map-reduce partition columns: key (type: string), value (type: string) - Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE - tag: 0 - auto parallelism: false + Spark HashTable Sink Operator + condition expressions: + 0 {key} {value} + 1 + keys: + 0 key (type: string), value (type: string) + 1 key (type: string), value (type: string) + Position of Big Table: 0 + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: #### A masked pattern was here #### Partition - base file name: test1 + base file name: test3 input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: @@ -1182,9 +1202,9 @@ STAGE PLANS: columns.comments columns.types string:string #### A masked pattern was here #### - name default.test1 + name default.test3 numFiles 3 - serialization.ddl struct test1 { string key, string value} + serialization.ddl struct test3 { string key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 4200 @@ -1202,41 +1222,80 @@ STAGE PLANS: columns.comments columns.types string:string #### A masked pattern was here #### - name default.test1 + name default.test3 numFiles 3 - serialization.ddl struct test1 { string key, string value} + serialization.ddl struct test3 { string key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 4200 #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.test1 - name: default.test1 + name: default.test3 + name: default.test3 Truncated Path -> Alias: - /test1 [l] - Map 3 + /test3 [r] + + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: + Map 1 Map Operator Tree: TableScan - alias: r + alias: l Statistics: Num rows: 21 Data size: 4200 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: (key is not null and value is not null) (type: boolean) Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: string), value (type: string) - sort order: ++ - Map-reduce partition columns: key (type: string), value (type: string) - Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE - tag: 1 - auto parallelism: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {value} + 1 {key} {value} + keys: + 0 key (type: string), value (type: string) + 1 key (type: string), value (type: string) + outputColumnNames: _col0, _col1, _col5, _col6 + input vertices: + 1 Map 2 + Position of Big Table: 0 + Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2,_col3 + columns.types string:string:string:string + escape.delim \ + hive.serialization.extend.nesting.levels true + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: #### A masked pattern was here #### Partition - base file name: test3 + base file name: test1 input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: @@ -1248,9 +1307,9 @@ STAGE PLANS: columns.comments columns.types string:string #### A masked pattern was here #### - name default.test3 + name default.test1 numFiles 3 - serialization.ddl struct test3 { string key, string value} + serialization.ddl struct test1 { string key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 4200 @@ -1268,54 +1327,18 @@ STAGE PLANS: columns.comments columns.types string:string #### A masked pattern was here #### - name default.test3 + name default.test1 numFiles 3 - serialization.ddl struct test3 { string key, string value} + serialization.ddl struct test1 { string key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 4200 #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.test3 - name: default.test3 + name: default.test1 + name: default.test1 Truncated Path -> Alias: - /test3 [r] - Reducer 2 - Needs Tagging: true - Reduce Operator Tree: - Join Operator - condition map: - Inner Join 0 to 1 - condition expressions: - 0 {KEY.reducesinkkey0} {KEY.reducesinkkey1} - 1 {KEY.reducesinkkey0} {KEY.reducesinkkey1} - outputColumnNames: _col0, _col1, _col5, _col6 - Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE - Select Operator - expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string) - outputColumnNames: _col0, _col1, _col2, _col3 - Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - GlobalTableId: 0 -#### A masked pattern was here #### - NumFilesPerFileSink: 1 - Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE -#### A masked pattern was here #### - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - properties: - columns _col0,_col1,_col2,_col3 - columns.types string:string:string:string - escape.delim \ - hive.serialization.extend.nesting.levels true - serialization.format 1 - serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - TotalFiles: 1 - GatherStats: false - MultiFileSpray: false + /test1 [l] Stage: Stage-0 Fetch Operator @@ -1374,53 +1397,55 @@ TOK_QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 Stage-0 depends on stages: Stage-1 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-2 Spark - Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 3 (PARTITION-LEVEL SORT, 1) #### A masked pattern was here #### Vertices: - Map 1 + Map 2 Map Operator Tree: TableScan - alias: l + alias: r Statistics: Num rows: 21 Data size: 4200 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: (key is not null and value is not null) (type: boolean) Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: string), value (type: string) - sort order: ++ - Map-reduce partition columns: key (type: string), value (type: string) - Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE - tag: 0 - auto parallelism: false + Spark HashTable Sink Operator + condition expressions: + 0 {key} {value} + 1 + keys: + 0 key (type: string), value (type: string) + 1 key (type: string), value (type: string) + Position of Big Table: 0 + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: #### A masked pattern was here #### Partition - base file name: test1 + base file name: test4 input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: COLUMN_STATS_ACCURATE true SORTBUCKETCOLSPREFIX TRUE bucket_count 3 - bucket_field_name key + bucket_field_name value columns key,value columns.comments columns.types string:string #### A masked pattern was here #### - name default.test1 + name default.test4 numFiles 3 - serialization.ddl struct test1 { string key, string value} + serialization.ddl struct test4 { string key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 4200 @@ -1433,60 +1458,99 @@ STAGE PLANS: COLUMN_STATS_ACCURATE true SORTBUCKETCOLSPREFIX TRUE bucket_count 3 - bucket_field_name key + bucket_field_name value columns key,value columns.comments columns.types string:string #### A masked pattern was here #### - name default.test1 + name default.test4 numFiles 3 - serialization.ddl struct test1 { string key, string value} + serialization.ddl struct test4 { string key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 4200 #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.test1 - name: default.test1 + name: default.test4 + name: default.test4 Truncated Path -> Alias: - /test1 [l] - Map 3 + /test4 [r] + + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: + Map 1 Map Operator Tree: TableScan - alias: r + alias: l Statistics: Num rows: 21 Data size: 4200 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: (key is not null and value is not null) (type: boolean) Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: string), value (type: string) - sort order: ++ - Map-reduce partition columns: key (type: string), value (type: string) - Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE - tag: 1 - auto parallelism: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {value} + 1 {key} {value} + keys: + 0 key (type: string), value (type: string) + 1 key (type: string), value (type: string) + outputColumnNames: _col0, _col1, _col5, _col6 + input vertices: + 1 Map 2 + Position of Big Table: 0 + Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2,_col3 + columns.types string:string:string:string + escape.delim \ + hive.serialization.extend.nesting.levels true + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: #### A masked pattern was here #### Partition - base file name: test4 + base file name: test1 input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: COLUMN_STATS_ACCURATE true SORTBUCKETCOLSPREFIX TRUE bucket_count 3 - bucket_field_name value + bucket_field_name key columns key,value columns.comments columns.types string:string #### A masked pattern was here #### - name default.test4 + name default.test1 numFiles 3 - serialization.ddl struct test4 { string key, string value} + serialization.ddl struct test1 { string key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 4200 @@ -1499,59 +1563,23 @@ STAGE PLANS: COLUMN_STATS_ACCURATE true SORTBUCKETCOLSPREFIX TRUE bucket_count 3 - bucket_field_name value + bucket_field_name key columns key,value columns.comments columns.types string:string #### A masked pattern was here #### - name default.test4 + name default.test1 numFiles 3 - serialization.ddl struct test4 { string key, string value} + serialization.ddl struct test1 { string key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 4200 #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.test4 - name: default.test4 + name: default.test1 + name: default.test1 Truncated Path -> Alias: - /test4 [r] - Reducer 2 - Needs Tagging: true - Reduce Operator Tree: - Join Operator - condition map: - Inner Join 0 to 1 - condition expressions: - 0 {KEY.reducesinkkey0} {KEY.reducesinkkey1} - 1 {KEY.reducesinkkey0} {KEY.reducesinkkey1} - outputColumnNames: _col0, _col1, _col5, _col6 - Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE - Select Operator - expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string) - outputColumnNames: _col0, _col1, _col2, _col3 - Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - GlobalTableId: 0 -#### A masked pattern was here #### - NumFilesPerFileSink: 1 - Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE -#### A masked pattern was here #### - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - properties: - columns _col0,_col1,_col2,_col3 - columns.types string:string:string:string - escape.delim \ - hive.serialization.extend.nesting.levels true - serialization.format 1 - serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - TotalFiles: 1 - GatherStats: false - MultiFileSpray: false + /test1 [l] Stage: Stage-0 Fetch Operator @@ -1610,53 +1638,55 @@ TOK_QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 Stage-0 depends on stages: Stage-1 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-2 Spark - Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 3 (PARTITION-LEVEL SORT, 1) #### A masked pattern was here #### Vertices: - Map 1 + Map 2 Map Operator Tree: TableScan - alias: l + alias: r Statistics: Num rows: 21 Data size: 4200 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: (key is not null and value is not null) (type: boolean) Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: string), value (type: string) - sort order: ++ - Map-reduce partition columns: key (type: string), value (type: string) - Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE - tag: 0 - auto parallelism: false + Spark HashTable Sink Operator + condition expressions: + 0 {key} {value} + 1 + keys: + 0 key (type: string), value (type: string) + 1 key (type: string), value (type: string) + Position of Big Table: 0 + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: #### A masked pattern was here #### Partition - base file name: test2 + base file name: test3 input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: COLUMN_STATS_ACCURATE true SORTBUCKETCOLSPREFIX TRUE bucket_count 3 - bucket_field_name value + bucket_field_name key columns key,value columns.comments columns.types string:string #### A masked pattern was here #### - name default.test2 + name default.test3 numFiles 3 - serialization.ddl struct test2 { string key, string value} + serialization.ddl struct test3 { string key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 4200 @@ -1669,60 +1699,99 @@ STAGE PLANS: COLUMN_STATS_ACCURATE true SORTBUCKETCOLSPREFIX TRUE bucket_count 3 - bucket_field_name value + bucket_field_name key columns key,value columns.comments columns.types string:string #### A masked pattern was here #### - name default.test2 + name default.test3 numFiles 3 - serialization.ddl struct test2 { string key, string value} + serialization.ddl struct test3 { string key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 4200 #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.test2 - name: default.test2 + name: default.test3 + name: default.test3 Truncated Path -> Alias: - /test2 [l] - Map 3 + /test3 [r] + + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: + Map 1 Map Operator Tree: TableScan - alias: r + alias: l Statistics: Num rows: 21 Data size: 4200 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: (key is not null and value is not null) (type: boolean) Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: string), value (type: string) - sort order: ++ - Map-reduce partition columns: key (type: string), value (type: string) - Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE - tag: 1 - auto parallelism: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {value} + 1 {key} {value} + keys: + 0 key (type: string), value (type: string) + 1 key (type: string), value (type: string) + outputColumnNames: _col0, _col1, _col5, _col6 + input vertices: + 1 Map 2 + Position of Big Table: 0 + Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2,_col3 + columns.types string:string:string:string + escape.delim \ + hive.serialization.extend.nesting.levels true + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: #### A masked pattern was here #### Partition - base file name: test3 + base file name: test2 input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: COLUMN_STATS_ACCURATE true SORTBUCKETCOLSPREFIX TRUE bucket_count 3 - bucket_field_name key + bucket_field_name value columns key,value columns.comments columns.types string:string #### A masked pattern was here #### - name default.test3 + name default.test2 numFiles 3 - serialization.ddl struct test3 { string key, string value} + serialization.ddl struct test2 { string key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 4200 @@ -1735,59 +1804,23 @@ STAGE PLANS: COLUMN_STATS_ACCURATE true SORTBUCKETCOLSPREFIX TRUE bucket_count 3 - bucket_field_name key + bucket_field_name value columns key,value columns.comments columns.types string:string #### A masked pattern was here #### - name default.test3 + name default.test2 numFiles 3 - serialization.ddl struct test3 { string key, string value} + serialization.ddl struct test2 { string key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 4200 #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.test3 - name: default.test3 + name: default.test2 + name: default.test2 Truncated Path -> Alias: - /test3 [r] - Reducer 2 - Needs Tagging: true - Reduce Operator Tree: - Join Operator - condition map: - Inner Join 0 to 1 - condition expressions: - 0 {KEY.reducesinkkey0} {KEY.reducesinkkey1} - 1 {KEY.reducesinkkey0} {KEY.reducesinkkey1} - outputColumnNames: _col0, _col1, _col5, _col6 - Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE - Select Operator - expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string) - outputColumnNames: _col0, _col1, _col2, _col3 - Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - GlobalTableId: 0 -#### A masked pattern was here #### - NumFilesPerFileSink: 1 - Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE -#### A masked pattern was here #### - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - properties: - columns _col0,_col1,_col2,_col3 - columns.types string:string:string:string - escape.delim \ - hive.serialization.extend.nesting.levels true - serialization.format 1 - serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - TotalFiles: 1 - GatherStats: false - MultiFileSpray: false + /test2 [l] Stage: Stage-0 Fetch Operator @@ -1846,39 +1879,41 @@ TOK_QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 Stage-0 depends on stages: Stage-1 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-2 Spark - Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 3 (PARTITION-LEVEL SORT, 1) #### A masked pattern was here #### Vertices: - Map 1 + Map 2 Map Operator Tree: TableScan - alias: l + alias: r Statistics: Num rows: 21 Data size: 4200 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: (key is not null and value is not null) (type: boolean) Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: string), value (type: string) - sort order: ++ - Map-reduce partition columns: key (type: string), value (type: string) - Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE - tag: 0 - auto parallelism: false + Spark HashTable Sink Operator + condition expressions: + 0 {key} {value} + 1 + keys: + 0 key (type: string), value (type: string) + 1 key (type: string), value (type: string) + Position of Big Table: 0 + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: #### A masked pattern was here #### Partition - base file name: test2 + base file name: test4 input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: @@ -1890,9 +1925,9 @@ STAGE PLANS: columns.comments columns.types string:string #### A masked pattern was here #### - name default.test2 + name default.test4 numFiles 3 - serialization.ddl struct test2 { string key, string value} + serialization.ddl struct test4 { string key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 4200 @@ -1910,41 +1945,80 @@ STAGE PLANS: columns.comments columns.types string:string #### A masked pattern was here #### - name default.test2 + name default.test4 numFiles 3 - serialization.ddl struct test2 { string key, string value} + serialization.ddl struct test4 { string key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 4200 #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.test2 - name: default.test2 + name: default.test4 + name: default.test4 Truncated Path -> Alias: - /test2 [l] - Map 3 + /test4 [r] + + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: + Map 1 Map Operator Tree: TableScan - alias: r + alias: l Statistics: Num rows: 21 Data size: 4200 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: (key is not null and value is not null) (type: boolean) Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: string), value (type: string) - sort order: ++ - Map-reduce partition columns: key (type: string), value (type: string) - Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE - tag: 1 - auto parallelism: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {value} + 1 {key} {value} + keys: + 0 key (type: string), value (type: string) + 1 key (type: string), value (type: string) + outputColumnNames: _col0, _col1, _col5, _col6 + input vertices: + 1 Map 2 + Position of Big Table: 0 + Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2,_col3 + columns.types string:string:string:string + escape.delim \ + hive.serialization.extend.nesting.levels true + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: #### A masked pattern was here #### Partition - base file name: test4 + base file name: test2 input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: @@ -1956,9 +2030,9 @@ STAGE PLANS: columns.comments columns.types string:string #### A masked pattern was here #### - name default.test4 + name default.test2 numFiles 3 - serialization.ddl struct test4 { string key, string value} + serialization.ddl struct test2 { string key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 4200 @@ -1976,54 +2050,18 @@ STAGE PLANS: columns.comments columns.types string:string #### A masked pattern was here #### - name default.test4 + name default.test2 numFiles 3 - serialization.ddl struct test4 { string key, string value} + serialization.ddl struct test2 { string key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 4200 #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.test4 - name: default.test4 + name: default.test2 + name: default.test2 Truncated Path -> Alias: - /test4 [r] - Reducer 2 - Needs Tagging: true - Reduce Operator Tree: - Join Operator - condition map: - Inner Join 0 to 1 - condition expressions: - 0 {KEY.reducesinkkey0} {KEY.reducesinkkey1} - 1 {KEY.reducesinkkey0} {KEY.reducesinkkey1} - outputColumnNames: _col0, _col1, _col5, _col6 - Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE - Select Operator - expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string) - outputColumnNames: _col0, _col1, _col2, _col3 - Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - GlobalTableId: 0 -#### A masked pattern was here #### - NumFilesPerFileSink: 1 - Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE -#### A masked pattern was here #### - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - properties: - columns _col0,_col1,_col2,_col3 - columns.types string:string:string:string - escape.delim \ - hive.serialization.extend.nesting.levels true - serialization.format 1 - serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - TotalFiles: 1 - GatherStats: false - MultiFileSpray: false + /test2 [l] Stage: Stage-0 Fetch Operator @@ -2082,53 +2120,55 @@ TOK_QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 Stage-0 depends on stages: Stage-1 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-2 Spark - Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 3 (PARTITION-LEVEL SORT, 1) #### A masked pattern was here #### Vertices: - Map 1 + Map 2 Map Operator Tree: TableScan - alias: l + alias: r Statistics: Num rows: 21 Data size: 4200 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: (key is not null and value is not null) (type: boolean) Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: string), value (type: string) - sort order: ++ - Map-reduce partition columns: key (type: string), value (type: string) - Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE - tag: 0 - auto parallelism: false + Spark HashTable Sink Operator + condition expressions: + 0 {key} {value} + 1 + keys: + 0 key (type: string), value (type: string) + 1 key (type: string), value (type: string) + Position of Big Table: 0 + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: #### A masked pattern was here #### Partition - base file name: test3 + base file name: test4 input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: COLUMN_STATS_ACCURATE true SORTBUCKETCOLSPREFIX TRUE bucket_count 3 - bucket_field_name key + bucket_field_name value columns key,value columns.comments columns.types string:string #### A masked pattern was here #### - name default.test3 + name default.test4 numFiles 3 - serialization.ddl struct test3 { string key, string value} + serialization.ddl struct test4 { string key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 4200 @@ -2141,60 +2181,99 @@ STAGE PLANS: COLUMN_STATS_ACCURATE true SORTBUCKETCOLSPREFIX TRUE bucket_count 3 - bucket_field_name key + bucket_field_name value columns key,value columns.comments columns.types string:string #### A masked pattern was here #### - name default.test3 + name default.test4 numFiles 3 - serialization.ddl struct test3 { string key, string value} + serialization.ddl struct test4 { string key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 4200 #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.test3 - name: default.test3 + name: default.test4 + name: default.test4 Truncated Path -> Alias: - /test3 [l] - Map 3 + /test4 [r] + + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: + Map 1 Map Operator Tree: TableScan - alias: r + alias: l Statistics: Num rows: 21 Data size: 4200 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: (key is not null and value is not null) (type: boolean) Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: string), value (type: string) - sort order: ++ - Map-reduce partition columns: key (type: string), value (type: string) - Statistics: Num rows: 6 Data size: 1200 Basic stats: COMPLETE Column stats: NONE - tag: 1 - auto parallelism: false + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {value} + 1 {key} {value} + keys: + 0 key (type: string), value (type: string) + 1 key (type: string), value (type: string) + outputColumnNames: _col0, _col1, _col5, _col6 + input vertices: + 1 Map 2 + Position of Big Table: 0 + Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2,_col3 + columns.types string:string:string:string + escape.delim \ + hive.serialization.extend.nesting.levels true + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: #### A masked pattern was here #### Partition - base file name: test4 + base file name: test3 input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: COLUMN_STATS_ACCURATE true SORTBUCKETCOLSPREFIX TRUE bucket_count 3 - bucket_field_name value + bucket_field_name key columns key,value columns.comments columns.types string:string #### A masked pattern was here #### - name default.test4 + name default.test3 numFiles 3 - serialization.ddl struct test4 { string key, string value} + serialization.ddl struct test3 { string key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 4200 @@ -2207,59 +2286,23 @@ STAGE PLANS: COLUMN_STATS_ACCURATE true SORTBUCKETCOLSPREFIX TRUE bucket_count 3 - bucket_field_name value + bucket_field_name key columns key,value columns.comments columns.types string:string #### A masked pattern was here #### - name default.test4 + name default.test3 numFiles 3 - serialization.ddl struct test4 { string key, string value} + serialization.ddl struct test3 { string key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 4200 #### A masked pattern was here #### serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.test4 - name: default.test4 + name: default.test3 + name: default.test3 Truncated Path -> Alias: - /test4 [r] - Reducer 2 - Needs Tagging: true - Reduce Operator Tree: - Join Operator - condition map: - Inner Join 0 to 1 - condition expressions: - 0 {KEY.reducesinkkey0} {KEY.reducesinkkey1} - 1 {KEY.reducesinkkey0} {KEY.reducesinkkey1} - outputColumnNames: _col0, _col1, _col5, _col6 - Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE - Select Operator - expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string) - outputColumnNames: _col0, _col1, _col2, _col3 - Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - GlobalTableId: 0 -#### A masked pattern was here #### - NumFilesPerFileSink: 1 - Statistics: Num rows: 6 Data size: 1320 Basic stats: COMPLETE Column stats: NONE -#### A masked pattern was here #### - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - properties: - columns _col0,_col1,_col2,_col3 - columns.types string:string:string:string - escape.delim \ - hive.serialization.extend.nesting.levels true - serialization.format 1 - serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - TotalFiles: 1 - GatherStats: false - MultiFileSpray: false + /test3 [l] Stage: Stage-0 Fetch Operator diff --git ql/src/test/results/clientpositive/spark/join25.q.out ql/src/test/results/clientpositive/spark/join25.q.out index 284c97d..84e496a 100644 --- ql/src/test/results/clientpositive/spark/join25.q.out +++ ql/src/test/results/clientpositive/spark/join25.q.out @@ -21,15 +21,14 @@ SELECT /*+ MAPJOIN(x) */ x.key, x.value, y.value FROM src1 x JOIN src y ON (x.key = y.key) POSTHOOK: type: QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-3 is a root stage + Stage-1 depends on stages: Stage-3 Stage-0 depends on stages: Stage-1 Stage-2 depends on stages: Stage-0 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-3 Spark - Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 3 (PARTITION-LEVEL SORT, 1) #### A masked pattern was here #### Vertices: Map 1 @@ -40,13 +39,21 @@ STAGE PLANS: Filter Operator predicate: key is not null (type: boolean) Statistics: Num rows: 13 Data size: 99 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: string) - sort order: + - Map-reduce partition columns: key (type: string) - Statistics: Num rows: 13 Data size: 99 Basic stats: COMPLETE Column stats: NONE - value expressions: value (type: string) - Map 3 + Spark HashTable Sink Operator + condition expressions: + 0 {value} + 1 {value} + keys: + 0 key (type: string) + 1 key (type: string) + Local Work: + Map Reduce Local Work + + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: + Map 2 Map Operator Tree: TableScan alias: y @@ -54,34 +61,33 @@ STAGE PLANS: Filter Operator predicate: key is not null (type: boolean) Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: string) - sort order: + - Map-reduce partition columns: key (type: string) - Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - value expressions: value (type: string) - Reducer 2 - Reduce Operator Tree: - Join Operator - condition map: - Inner Join 0 to 1 - condition expressions: - 0 {KEY.reducesinkkey0} {VALUE._col0} - 1 {VALUE._col0} - outputColumnNames: _col0, _col1, _col6 - Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE - Select Operator - expressions: UDFToInteger(_col0) (type: int), _col1 (type: string), _col6 (type: string) - outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.dest_j1 + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {value} + 1 {value} + keys: + 0 key (type: string) + 1 key (type: string) + outputColumnNames: _col0, _col1, _col6 + input vertices: + 0 Map 1 + Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: UDFToInteger(_col0) (type: int), _col1 (type: string), _col6 (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_j1 + Local Work: + Map Reduce Local Work Stage: Stage-0 Move Operator diff --git ql/src/test/results/clientpositive/spark/join26.q.out ql/src/test/results/clientpositive/spark/join26.q.out index e271184..8731942 100644 --- ql/src/test/results/clientpositive/spark/join26.q.out +++ ql/src/test/results/clientpositive/spark/join26.q.out @@ -102,15 +102,14 @@ TOK_QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-3 is a root stage + Stage-1 depends on stages: Stage-3 Stage-0 depends on stages: Stage-1 Stage-2 depends on stages: Stage-0 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-3 Spark - Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 3 (PARTITION-LEVEL SORT, 1), Map 4 (PARTITION-LEVEL SORT, 1) #### A masked pattern was here #### Vertices: Map 1 @@ -123,13 +122,18 @@ STAGE PLANS: isSamplingPred: false predicate: key is not null (type: boolean) Statistics: Num rows: 13 Data size: 99 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: string) - sort order: + - Map-reduce partition columns: key (type: string) - Statistics: Num rows: 13 Data size: 99 Basic stats: COMPLETE Column stats: NONE - tag: 0 - auto parallelism: false + Spark HashTable Sink Operator + condition expressions: + 0 + 1 {value} + 2 {value} + keys: + 0 key (type: string) + 1 key (type: string) + 2 key (type: string) + Position of Big Table: 2 + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: @@ -179,7 +183,7 @@ STAGE PLANS: name: default.src1 Truncated Path -> Alias: /src1 [x] - Map 3 + Map 2 Map Operator Tree: TableScan alias: y @@ -189,14 +193,18 @@ STAGE PLANS: isSamplingPred: false predicate: key is not null (type: boolean) Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: string) - sort order: + - Map-reduce partition columns: key (type: string) - Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - tag: 1 - value expressions: value (type: string) - auto parallelism: false + Spark HashTable Sink Operator + condition expressions: + 0 {key} + 1 {value} + 2 {value} + keys: + 0 key (type: string) + 1 key (type: string) + 2 key (type: string) + Position of Big Table: 2 + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: @@ -246,7 +254,12 @@ STAGE PLANS: name: default.src Truncated Path -> Alias: /src [y] - Map 4 + + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: + Map 3 Map Operator Tree: TableScan alias: z @@ -256,14 +269,56 @@ STAGE PLANS: isSamplingPred: false predicate: key is not null (type: boolean) Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: string) - sort order: + - Map-reduce partition columns: key (type: string) - Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - tag: 2 - value expressions: value (type: string) - auto parallelism: false + Map Join Operator + condition map: + Inner Join 0 to 1 + Inner Join 0 to 2 + condition expressions: + 0 {key} + 1 {value} + 2 {value} + keys: + 0 key (type: string) + 1 key (type: string) + 2 key (type: string) + outputColumnNames: _col0, _col6, _col11 + input vertices: + 0 Map 1 + 1 Map 2 + Position of Big Table: 2 + Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col11 (type: string), _col6 (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + GlobalTableId: 1 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns key,value,val2 + columns.comments + columns.types string:string:string +#### A masked pattern was here #### + name default.dest_j1 + serialization.ddl struct dest_j1 { string key, string value, string val2} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_j1 + TotalFiles: 1 + GatherStats: true + MultiFileSpray: false + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: @@ -315,49 +370,6 @@ STAGE PLANS: name: default.srcpart Truncated Path -> Alias: /srcpart/ds=2008-04-08/hr=11 [z] - Reducer 2 - Needs Tagging: true - Reduce Operator Tree: - Join Operator - condition map: - Inner Join 0 to 1 - Inner Join 0 to 2 - condition expressions: - 0 {KEY.reducesinkkey0} - 1 {VALUE._col0} - 2 {VALUE._col0} - outputColumnNames: _col0, _col6, _col11 - Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE - Select Operator - expressions: _col0 (type: string), _col11 (type: string), _col6 (type: string) - outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - GlobalTableId: 1 -#### A masked pattern was here #### - NumFilesPerFileSink: 1 - Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE -#### A masked pattern was here #### - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - properties: - bucket_count -1 - columns key,value,val2 - columns.comments - columns.types string:string:string -#### A masked pattern was here #### - name default.dest_j1 - serialization.ddl struct dest_j1 { string key, string value, string val2} - serialization.format 1 - serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe -#### A masked pattern was here #### - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.dest_j1 - TotalFiles: 1 - GatherStats: true - MultiFileSpray: false Stage: Stage-0 Move Operator diff --git ql/src/test/results/clientpositive/spark/join27.q.out ql/src/test/results/clientpositive/spark/join27.q.out index d31f29e..a578279 100644 --- ql/src/test/results/clientpositive/spark/join27.q.out +++ ql/src/test/results/clientpositive/spark/join27.q.out @@ -21,15 +21,14 @@ SELECT /*+ MAPJOIN(x) */ x.key, x.value, y.value FROM src1 x JOIN src y ON (x.value = y.value) POSTHOOK: type: QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-3 is a root stage + Stage-1 depends on stages: Stage-3 Stage-0 depends on stages: Stage-1 Stage-2 depends on stages: Stage-0 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-3 Spark - Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 3 (PARTITION-LEVEL SORT, 1) #### A masked pattern was here #### Vertices: Map 1 @@ -40,13 +39,21 @@ STAGE PLANS: Filter Operator predicate: value is not null (type: boolean) Statistics: Num rows: 13 Data size: 99 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: value (type: string) - sort order: + - Map-reduce partition columns: value (type: string) - Statistics: Num rows: 13 Data size: 99 Basic stats: COMPLETE Column stats: NONE - value expressions: key (type: string) - Map 3 + Spark HashTable Sink Operator + condition expressions: + 0 {key} + 1 {value} + keys: + 0 value (type: string) + 1 value (type: string) + Local Work: + Map Reduce Local Work + + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: + Map 2 Map Operator Tree: TableScan alias: y @@ -54,33 +61,33 @@ STAGE PLANS: Filter Operator predicate: value is not null (type: boolean) Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: value (type: string) - sort order: + - Map-reduce partition columns: value (type: string) - Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - Reducer 2 - Reduce Operator Tree: - Join Operator - condition map: - Inner Join 0 to 1 - condition expressions: - 0 {VALUE._col0} {KEY.reducesinkkey0} - 1 {KEY.reducesinkkey0} - outputColumnNames: _col0, _col1, _col6 - Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE - Select Operator - expressions: UDFToInteger(_col0) (type: int), _col1 (type: string), _col6 (type: string) - outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.dest_j1 + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {value} + 1 {value} + keys: + 0 value (type: string) + 1 value (type: string) + outputColumnNames: _col0, _col1, _col6 + input vertices: + 0 Map 1 + Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: UDFToInteger(_col0) (type: int), _col1 (type: string), _col6 (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_j1 + Local Work: + Map Reduce Local Work Stage: Stage-0 Move Operator diff --git ql/src/test/results/clientpositive/spark/join30.q.out ql/src/test/results/clientpositive/spark/join30.q.out index 7fbbcfa..f6d2117 100644 --- ql/src/test/results/clientpositive/spark/join30.q.out +++ ql/src/test/results/clientpositive/spark/join30.q.out @@ -19,16 +19,14 @@ INSERT OVERWRITE TABLE dest_j1 SELECT /*+ MAPJOIN(x) */ x.key, count(1) FROM src1 x JOIN src y ON (x.key = y.key) group by x.key POSTHOOK: type: QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-3 is a root stage + Stage-1 depends on stages: Stage-3 Stage-0 depends on stages: Stage-1 Stage-2 depends on stages: Stage-0 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-3 Spark - Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 4 (PARTITION-LEVEL SORT, 1) - Reducer 3 <- Reducer 2 (GROUP, 1) #### A masked pattern was here #### Vertices: Map 1 @@ -39,12 +37,23 @@ STAGE PLANS: Filter Operator predicate: key is not null (type: boolean) Statistics: Num rows: 13 Data size: 99 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: string) - sort order: + - Map-reduce partition columns: key (type: string) - Statistics: Num rows: 13 Data size: 99 Basic stats: COMPLETE Column stats: NONE - Map 4 + Spark HashTable Sink Operator + condition expressions: + 0 + 1 + keys: + 0 key (type: string) + 1 key (type: string) + Local Work: + Map Reduce Local Work + + Stage: Stage-1 + Spark + Edges: + Reducer 3 <- Map 2 (GROUP, 1) +#### A masked pattern was here #### + Vertices: + Map 2 Map Operator Tree: TableScan alias: y @@ -52,33 +61,33 @@ STAGE PLANS: Filter Operator predicate: key is not null (type: boolean) Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: string) - sort order: + - Map-reduce partition columns: key (type: string) - Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - Reducer 2 - Reduce Operator Tree: - Join Operator - condition map: - Inner Join 0 to 1 - condition expressions: - 0 {KEY.reducesinkkey0} - 1 - outputColumnNames: _col0 - Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE - Group By Operator - aggregations: count(1) - keys: _col0 (type: string) - mode: hash - outputColumnNames: _col0, _col1 - Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: _col0 (type: string) - sort order: + - Map-reduce partition columns: _col0 (type: string) - Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: bigint) + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} + 1 + keys: + 0 key (type: string) + 1 key (type: string) + outputColumnNames: _col0 + input vertices: + 0 Map 1 + Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(1) + keys: _col0 (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: bigint) + Local Work: + Map Reduce Local Work Reducer 3 Reduce Operator Tree: Group By Operator diff --git ql/src/test/results/clientpositive/spark/join36.q.out ql/src/test/results/clientpositive/spark/join36.q.out index f1317ea..4850951 100644 --- ql/src/test/results/clientpositive/spark/join36.q.out +++ ql/src/test/results/clientpositive/spark/join36.q.out @@ -61,15 +61,14 @@ SELECT /*+ MAPJOIN(x) */ x.key, x.cnt, y.cnt FROM tmp1 x JOIN tmp2 y ON (x.key = y.key) POSTHOOK: type: QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-3 is a root stage + Stage-1 depends on stages: Stage-3 Stage-0 depends on stages: Stage-1 Stage-2 depends on stages: Stage-0 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-3 Spark - Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 3 (PARTITION-LEVEL SORT, 3) #### A masked pattern was here #### Vertices: Map 1 @@ -80,13 +79,21 @@ STAGE PLANS: Filter Operator predicate: key is not null (type: boolean) Statistics: Num rows: 155 Data size: 743 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: int) - sort order: + - Map-reduce partition columns: key (type: int) - Statistics: Num rows: 155 Data size: 743 Basic stats: COMPLETE Column stats: NONE - value expressions: cnt (type: int) - Map 3 + Spark HashTable Sink Operator + condition expressions: + 0 {cnt} + 1 {cnt} + keys: + 0 key (type: int) + 1 key (type: int) + Local Work: + Map Reduce Local Work + + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: + Map 2 Map Operator Tree: TableScan alias: y @@ -94,34 +101,33 @@ STAGE PLANS: Filter Operator predicate: key is not null (type: boolean) Statistics: Num rows: 155 Data size: 743 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: int) - sort order: + - Map-reduce partition columns: key (type: int) - Statistics: Num rows: 155 Data size: 743 Basic stats: COMPLETE Column stats: NONE - value expressions: cnt (type: int) - Reducer 2 - Reduce Operator Tree: - Join Operator - condition map: - Inner Join 0 to 1 - condition expressions: - 0 {KEY.reducesinkkey0} {VALUE._col0} - 1 {VALUE._col0} - outputColumnNames: _col0, _col1, _col6 - Statistics: Num rows: 170 Data size: 817 Basic stats: COMPLETE Column stats: NONE - Select Operator - expressions: _col0 (type: int), _col1 (type: int), _col6 (type: int) - outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 170 Data size: 817 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - Statistics: Num rows: 170 Data size: 817 Basic stats: COMPLETE Column stats: NONE - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.dest_j1 + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {cnt} + 1 {cnt} + keys: + 0 key (type: int) + 1 key (type: int) + outputColumnNames: _col0, _col1, _col6 + input vertices: + 0 Map 1 + Statistics: Num rows: 170 Data size: 817 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: int), _col1 (type: int), _col6 (type: int) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 170 Data size: 817 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 170 Data size: 817 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_j1 + Local Work: + Map Reduce Local Work Stage: Stage-0 Move Operator diff --git ql/src/test/results/clientpositive/spark/join37.q.out ql/src/test/results/clientpositive/spark/join37.q.out index 448e983..bdb9aa6 100644 --- ql/src/test/results/clientpositive/spark/join37.q.out +++ ql/src/test/results/clientpositive/spark/join37.q.out @@ -21,15 +21,14 @@ SELECT /*+ MAPJOIN(X) */ x.key, x.value, y.value FROM src1 x JOIN src y ON (x.key = y.key) POSTHOOK: type: QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-3 is a root stage + Stage-1 depends on stages: Stage-3 Stage-0 depends on stages: Stage-1 Stage-2 depends on stages: Stage-0 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-3 Spark - Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 3 (PARTITION-LEVEL SORT, 1) #### A masked pattern was here #### Vertices: Map 1 @@ -40,13 +39,21 @@ STAGE PLANS: Filter Operator predicate: key is not null (type: boolean) Statistics: Num rows: 13 Data size: 99 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: string) - sort order: + - Map-reduce partition columns: key (type: string) - Statistics: Num rows: 13 Data size: 99 Basic stats: COMPLETE Column stats: NONE - value expressions: value (type: string) - Map 3 + Spark HashTable Sink Operator + condition expressions: + 0 {value} + 1 {value} + keys: + 0 key (type: string) + 1 key (type: string) + Local Work: + Map Reduce Local Work + + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: + Map 2 Map Operator Tree: TableScan alias: y @@ -54,34 +61,33 @@ STAGE PLANS: Filter Operator predicate: key is not null (type: boolean) Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: string) - sort order: + - Map-reduce partition columns: key (type: string) - Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - value expressions: value (type: string) - Reducer 2 - Reduce Operator Tree: - Join Operator - condition map: - Inner Join 0 to 1 - condition expressions: - 0 {KEY.reducesinkkey0} {VALUE._col0} - 1 {VALUE._col0} - outputColumnNames: _col0, _col1, _col6 - Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE - Select Operator - expressions: UDFToInteger(_col0) (type: int), _col1 (type: string), _col6 (type: string) - outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.dest_j1 + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {value} + 1 {value} + keys: + 0 key (type: string) + 1 key (type: string) + outputColumnNames: _col0, _col1, _col6 + input vertices: + 0 Map 1 + Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: UDFToInteger(_col0) (type: int), _col1 (type: string), _col6 (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_j1 + Local Work: + Map Reduce Local Work Stage: Stage-0 Move Operator diff --git ql/src/test/results/clientpositive/spark/join38.q.out ql/src/test/results/clientpositive/spark/join38.q.out index 735d7ea..330800e 100644 --- ql/src/test/results/clientpositive/spark/join38.q.out +++ ql/src/test/results/clientpositive/spark/join38.q.out @@ -58,14 +58,14 @@ STAGE PLANS: Spark #### A masked pattern was here #### Vertices: - Map 3 + Map 1 Map Operator Tree: TableScan - alias: b - Statistics: Num rows: 2 Data size: 126 Basic stats: COMPLETE Column stats: NONE + alias: a + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Filter Operator - predicate: (col11 = 111) (type: boolean) - Statistics: Num rows: 1 Data size: 63 Basic stats: COMPLETE Column stats: NONE + predicate: (key = 111) (type: boolean) + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE Spark HashTable Sink Operator condition expressions: 0 {value} @@ -79,17 +79,17 @@ STAGE PLANS: Stage: Stage-1 Spark Edges: - Reducer 2 <- Map 1 (GROUP, 3) + Reducer 3 <- Map 2 (GROUP, 3) #### A masked pattern was here #### Vertices: - Map 1 + Map 2 Map Operator Tree: TableScan - alias: a - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + alias: b + Statistics: Num rows: 2 Data size: 126 Basic stats: COMPLETE Column stats: NONE Filter Operator - predicate: (key = 111) (type: boolean) - Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + predicate: (col11 = 111) (type: boolean) + Statistics: Num rows: 1 Data size: 63 Basic stats: COMPLETE Column stats: NONE Map Join Operator condition map: Inner Join 0 to 1 @@ -101,7 +101,7 @@ STAGE PLANS: 1 '111' (type: string) outputColumnNames: _col1, _col10 input vertices: - 1 Map 3 + 0 Map 1 Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE Group By Operator aggregations: count(1) @@ -117,7 +117,7 @@ STAGE PLANS: value expressions: _col2 (type: bigint) Local Work: Map Reduce Local Work - Reducer 2 + Reducer 3 Reduce Operator Tree: Group By Operator aggregations: count(VALUE._col0) diff --git ql/src/test/results/clientpositive/spark/join39.q.out ql/src/test/results/clientpositive/spark/join39.q.out index 0734d4b..416c92e 100644 --- ql/src/test/results/clientpositive/spark/join39.q.out +++ ql/src/test/results/clientpositive/spark/join39.q.out @@ -21,15 +21,14 @@ SELECT /*+ MAPJOIN(y) */ x.key, x.value, y.key, y.value FROM src x left outer JOIN (select * from src where key <= 100) y ON (x.key = y.key) POSTHOOK: type: QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-3 is a root stage + Stage-1 depends on stages: Stage-3 Stage-0 depends on stages: Stage-1 Stage-2 depends on stages: Stage-0 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-3 Spark - Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 3 (PARTITION-LEVEL SORT, 1) #### A masked pattern was here #### Vertices: Map 1 @@ -44,45 +43,52 @@ STAGE PLANS: expressions: key (type: string), value (type: string) outputColumnNames: _col0, _col1 Statistics: Num rows: 166 Data size: 1763 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: _col0 (type: string) - sort order: + - Map-reduce partition columns: _col0 (type: string) - Statistics: Num rows: 166 Data size: 1763 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: string) - Map 3 + Spark HashTable Sink Operator + condition expressions: + 0 {key} {value} + 1 {_col1} + keys: + 0 key (type: string) + 1 _col0 (type: string) + Local Work: + Map Reduce Local Work + + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: + Map 2 Map Operator Tree: TableScan alias: x Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: string) - sort order: + - Map-reduce partition columns: key (type: string) - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: value (type: string) - Reducer 2 - Reduce Operator Tree: - Join Operator - condition map: - Left Outer Join0 to 1 - condition expressions: - 0 {KEY.reducesinkkey0} {VALUE._col0} - 1 {KEY.reducesinkkey0} {VALUE._col0} - outputColumnNames: _col0, _col1, _col5, _col6 - Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE - Select Operator - expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string) - outputColumnNames: _col0, _col1, _col2, _col3 - Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false + Map Join Operator + condition map: + Left Outer Join0 to 1 + condition expressions: + 0 {key} {value} + 1 {_col0} {_col1} + keys: + 0 key (type: string) + 1 _col0 (type: string) + outputColumnNames: _col0, _col1, _col5, _col6 + input vertices: + 1 Map 1 Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.dest_j1 + Select Operator + expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_j1 + Local Work: + Map Reduce Local Work Stage: Stage-0 Move Operator diff --git ql/src/test/results/clientpositive/spark/join40.q.out ql/src/test/results/clientpositive/spark/join40.q.out index 60ef13d..6e990c6 100644 --- ql/src/test/results/clientpositive/spark/join40.q.out +++ ql/src/test/results/clientpositive/spark/join40.q.out @@ -3093,14 +3093,13 @@ SELECT /*+ MAPJOIN(y) */ x.key, x.value, y.key, y.value FROM src x left outer JOIN (select * from src where key <= 100) y ON (x.key = y.key) POSTHOOK: type: QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 Stage-0 depends on stages: Stage-1 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-2 Spark - Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 3 (PARTITION-LEVEL SORT, 3) #### A masked pattern was here #### Vertices: Map 1 @@ -3115,44 +3114,51 @@ STAGE PLANS: expressions: key (type: string), value (type: string) outputColumnNames: _col0, _col1 Statistics: Num rows: 166 Data size: 1763 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: _col0 (type: string) - sort order: + - Map-reduce partition columns: _col0 (type: string) - Statistics: Num rows: 166 Data size: 1763 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: string) - Map 3 + Spark HashTable Sink Operator + condition expressions: + 0 {key} {value} + 1 {_col1} + keys: + 0 key (type: string) + 1 _col0 (type: string) + Local Work: + Map Reduce Local Work + + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: + Map 2 Map Operator Tree: TableScan alias: x Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: string) - sort order: + - Map-reduce partition columns: key (type: string) - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: value (type: string) - Reducer 2 - Reduce Operator Tree: - Join Operator - condition map: - Left Outer Join0 to 1 - condition expressions: - 0 {KEY.reducesinkkey0} {VALUE._col0} - 1 {KEY.reducesinkkey0} {VALUE._col0} - outputColumnNames: _col0, _col1, _col5, _col6 - Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE - Select Operator - expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string) - outputColumnNames: _col0, _col1, _col2, _col3 - Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false + Map Join Operator + condition map: + Left Outer Join0 to 1 + condition expressions: + 0 {key} {value} + 1 {_col0} {_col1} + keys: + 0 key (type: string) + 1 _col0 (type: string) + outputColumnNames: _col0, _col1, _col5, _col6 + input vertices: + 1 Map 1 Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Select Operator + expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Local Work: + Map Reduce Local Work Stage: Stage-0 Fetch Operator diff --git ql/src/test/results/clientpositive/spark/join_map_ppr.q.out ql/src/test/results/clientpositive/spark/join_map_ppr.q.out index 59fdb99..91974f1 100644 --- ql/src/test/results/clientpositive/spark/join_map_ppr.q.out +++ ql/src/test/results/clientpositive/spark/join_map_ppr.q.out @@ -104,15 +104,14 @@ TOK_QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-3 is a root stage + Stage-1 depends on stages: Stage-3 Stage-0 depends on stages: Stage-1 Stage-2 depends on stages: Stage-0 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-3 Spark - Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 3 (PARTITION-LEVEL SORT, 1), Map 4 (PARTITION-LEVEL SORT, 1) #### A masked pattern was here #### Vertices: Map 1 @@ -125,13 +124,18 @@ STAGE PLANS: isSamplingPred: false predicate: key is not null (type: boolean) Statistics: Num rows: 13 Data size: 99 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: string) - sort order: + - Map-reduce partition columns: key (type: string) - Statistics: Num rows: 13 Data size: 99 Basic stats: COMPLETE Column stats: NONE - tag: 0 - auto parallelism: false + Spark HashTable Sink Operator + condition expressions: + 0 + 1 {value} + 2 {value} + keys: + 0 key (type: string) + 1 key (type: string) + 2 key (type: string) + Position of Big Table: 2 + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: @@ -181,7 +185,7 @@ STAGE PLANS: name: default.src1 Truncated Path -> Alias: /src1 [x] - Map 3 + Map 2 Map Operator Tree: TableScan alias: y @@ -191,14 +195,18 @@ STAGE PLANS: isSamplingPred: false predicate: key is not null (type: boolean) Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: string) - sort order: + - Map-reduce partition columns: key (type: string) - Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - tag: 1 - value expressions: value (type: string) - auto parallelism: false + Spark HashTable Sink Operator + condition expressions: + 0 {key} + 1 {value} + 2 {value} + keys: + 0 key (type: string) + 1 key (type: string) + 2 key (type: string) + Position of Big Table: 2 + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: @@ -248,7 +256,12 @@ STAGE PLANS: name: default.src Truncated Path -> Alias: /src [y] - Map 4 + + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: + Map 3 Map Operator Tree: TableScan alias: z @@ -258,14 +271,56 @@ STAGE PLANS: isSamplingPred: false predicate: key is not null (type: boolean) Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: string) - sort order: + - Map-reduce partition columns: key (type: string) - Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - tag: 2 - value expressions: value (type: string) - auto parallelism: false + Map Join Operator + condition map: + Inner Join 0 to 1 + Inner Join 0 to 2 + condition expressions: + 0 {key} + 1 {value} + 2 {value} + keys: + 0 key (type: string) + 1 key (type: string) + 2 key (type: string) + outputColumnNames: _col0, _col6, _col11 + input vertices: + 0 Map 1 + 1 Map 2 + Position of Big Table: 2 + Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col11 (type: string), _col6 (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + GlobalTableId: 1 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns key,value,val2 + columns.comments + columns.types string:string:string +#### A masked pattern was here #### + name default.dest_j1 + serialization.ddl struct dest_j1 { string key, string value, string val2} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_j1 + TotalFiles: 1 + GatherStats: true + MultiFileSpray: false + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: @@ -317,49 +372,6 @@ STAGE PLANS: name: default.srcpart Truncated Path -> Alias: /srcpart/ds=2008-04-08/hr=11 [z] - Reducer 2 - Needs Tagging: true - Reduce Operator Tree: - Join Operator - condition map: - Inner Join 0 to 1 - Inner Join 0 to 2 - condition expressions: - 0 {KEY.reducesinkkey0} - 1 {VALUE._col0} - 2 {VALUE._col0} - outputColumnNames: _col0, _col6, _col11 - Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE - Select Operator - expressions: _col0 (type: string), _col11 (type: string), _col6 (type: string) - outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - GlobalTableId: 1 -#### A masked pattern was here #### - NumFilesPerFileSink: 1 - Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE -#### A masked pattern was here #### - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - properties: - bucket_count -1 - columns key,value,val2 - columns.comments - columns.types string:string:string -#### A masked pattern was here #### - name default.dest_j1 - serialization.ddl struct dest_j1 { string key, string value, string val2} - serialization.format 1 - serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe -#### A masked pattern was here #### - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.dest_j1 - TotalFiles: 1 - GatherStats: true - MultiFileSpray: false Stage: Stage-0 Move Operator @@ -657,15 +669,14 @@ TOK_QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-3 is a root stage + Stage-1 depends on stages: Stage-3 Stage-0 depends on stages: Stage-1 Stage-2 depends on stages: Stage-0 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-3 Spark - Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 3 (PARTITION-LEVEL SORT, 3), Map 4 (PARTITION-LEVEL SORT, 3) #### A masked pattern was here #### Vertices: Map 1 @@ -678,14 +689,18 @@ STAGE PLANS: isSamplingPred: false predicate: UDFToDouble(key) is not null (type: boolean) Statistics: Num rows: 13 Data size: 99 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: UDFToDouble(key) (type: double) - sort order: + - Map-reduce partition columns: UDFToDouble(key) (type: double) - Statistics: Num rows: 13 Data size: 99 Basic stats: COMPLETE Column stats: NONE - tag: 0 - value expressions: key (type: string) - auto parallelism: false + Spark HashTable Sink Operator + condition expressions: + 0 {key} + 1 {value} + 2 {value} + keys: + 0 UDFToDouble(key) (type: double) + 1 UDFToDouble(key) (type: double) + 2 UDFToDouble(key) (type: double) + Position of Big Table: 2 + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: @@ -735,7 +750,7 @@ STAGE PLANS: name: default.src1_copy Truncated Path -> Alias: /src1_copy [x] - Map 3 + Map 2 Map Operator Tree: TableScan alias: y @@ -745,14 +760,18 @@ STAGE PLANS: isSamplingPred: false predicate: UDFToDouble(key) is not null (type: boolean) Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: UDFToDouble(key) (type: double) - sort order: + - Map-reduce partition columns: UDFToDouble(key) (type: double) - Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - tag: 1 - value expressions: value (type: string) - auto parallelism: false + Spark HashTable Sink Operator + condition expressions: + 0 {key} + 1 {value} + 2 {value} + keys: + 0 UDFToDouble(key) (type: double) + 1 UDFToDouble(key) (type: double) + 2 UDFToDouble(key) (type: double) + Position of Big Table: 2 + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: @@ -802,7 +821,12 @@ STAGE PLANS: name: default.src_copy Truncated Path -> Alias: /src_copy [y] - Map 4 + + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: + Map 3 Map Operator Tree: TableScan alias: z @@ -812,14 +836,61 @@ STAGE PLANS: isSamplingPred: false predicate: UDFToDouble(key) is not null (type: boolean) Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: UDFToDouble(key) (type: double) - sort order: + - Map-reduce partition columns: UDFToDouble(key) (type: double) - Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE - tag: 2 - value expressions: value (type: string) - auto parallelism: false + Map Join Operator + condition map: + Inner Join 0 to 1 + Inner Join 0 to 2 + condition expressions: + 0 {key} + 1 {value} + 2 {value} + keys: + 0 UDFToDouble(key) (type: double) + 1 UDFToDouble(key) (type: double) + 2 UDFToDouble(key) (type: double) + outputColumnNames: _col0, _col6, _col11 + input vertices: + 0 Map 1 + 1 Map 2 + Position of Big Table: 2 + Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col11 (type: string), _col6 (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + GlobalTableId: 1 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + COLUMN_STATS_ACCURATE true + bucket_count -1 + columns key,value,val2 + columns.comments + columns.types string:string:string +#### A masked pattern was here #### + name default.dest_j1 + numFiles 1 + numRows 107 + rawDataSize 2018 + serialization.ddl struct dest_j1 { string key, string value, string val2} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 2125 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest_j1 + TotalFiles: 1 + GatherStats: true + MultiFileSpray: false + Local Work: + Map Reduce Local Work Path -> Alias: #### A masked pattern was here #### Path -> Partition: @@ -871,54 +942,6 @@ STAGE PLANS: name: default.srcpart Truncated Path -> Alias: /srcpart/ds=2008-04-08/hr=11 [z] - Reducer 2 - Needs Tagging: true - Reduce Operator Tree: - Join Operator - condition map: - Inner Join 0 to 1 - Inner Join 0 to 2 - condition expressions: - 0 {VALUE._col0} - 1 {VALUE._col1} - 2 {VALUE._col1} - outputColumnNames: _col0, _col6, _col11 - Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE - Select Operator - expressions: _col0 (type: string), _col11 (type: string), _col6 (type: string) - outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - GlobalTableId: 1 -#### A masked pattern was here #### - NumFilesPerFileSink: 1 - Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE -#### A masked pattern was here #### - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - properties: - COLUMN_STATS_ACCURATE true - bucket_count -1 - columns key,value,val2 - columns.comments - columns.types string:string:string -#### A masked pattern was here #### - name default.dest_j1 - numFiles 1 - numRows 107 - rawDataSize 2018 - serialization.ddl struct dest_j1 { string key, string value, string val2} - serialization.format 1 - serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - totalSize 2125 -#### A masked pattern was here #### - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.dest_j1 - TotalFiles: 1 - GatherStats: true - MultiFileSpray: false Stage: Stage-0 Move Operator diff --git ql/src/test/results/clientpositive/spark/mapjoin1.q.out ql/src/test/results/clientpositive/spark/mapjoin1.q.out index 80e38b9..0e5907c 100644 --- ql/src/test/results/clientpositive/spark/mapjoin1.q.out +++ ql/src/test/results/clientpositive/spark/mapjoin1.q.out @@ -30,14 +30,13 @@ EXPLAIN SELECT /*+ MAPJOIN(a) */ * FROM src a RIGHT OUTER JOIN src b on a.key=b.key AND true limit 10 POSTHOOK: type: QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 Stage-0 depends on stages: Stage-1 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-2 Spark - Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 3 (PARTITION-LEVEL SORT, 3) #### A masked pattern was here #### Vertices: Map 1 @@ -45,50 +44,60 @@ STAGE PLANS: TableScan alias: a Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: string) - sort order: + - Map-reduce partition columns: key (type: string) - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: value (type: string) - Map 3 + Spark HashTable Sink Operator + condition expressions: + 0 {value} + 1 {key} {value} + filter predicates: + 0 + 1 {true} + keys: + 0 key (type: string) + 1 key (type: string) + Local Work: + Map Reduce Local Work + + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: + Map 2 Map Operator Tree: TableScan alias: b Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: string) - sort order: + - Map-reduce partition columns: key (type: string) - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: value (type: string) - Reducer 2 - Reduce Operator Tree: - Join Operator - condition map: - Right Outer Join0 to 1 - condition expressions: - 0 {KEY.reducesinkkey0} {VALUE._col0} - 1 {KEY.reducesinkkey0} {VALUE._col0} - filter predicates: - 0 - 1 {true} - outputColumnNames: _col0, _col1, _col5, _col6 - Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE - Select Operator - expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string) - outputColumnNames: _col0, _col1, _col2, _col3 - Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE - Limit - Number of rows: 10 - Statistics: Num rows: 10 Data size: 100 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - Statistics: Num rows: 10 Data size: 100 Basic stats: COMPLETE Column stats: NONE - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Map Join Operator + condition map: + Right Outer Join0 to 1 + condition expressions: + 0 {key} {value} + 1 {key} {value} + filter predicates: + 0 + 1 {true} + keys: + 0 key (type: string) + 1 key (type: string) + outputColumnNames: _col0, _col1, _col5, _col6 + input vertices: + 0 Map 1 + Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE + Limit + Number of rows: 10 + Statistics: Num rows: 10 Data size: 100 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 10 Data size: 100 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Local Work: + Map Reduce Local Work Stage: Stage-0 Fetch Operator @@ -104,16 +113,16 @@ POSTHOOK: query: SELECT /*+ MAPJOIN(a) */ * FROM src a RIGHT OUTER JOIN src b on POSTHOOK: type: QUERY POSTHOOK: Input: default@src #### A masked pattern was here #### -0 val_0 0 val_0 -0 val_0 0 val_0 -0 val_0 0 val_0 -0 val_0 0 val_0 -0 val_0 0 val_0 -0 val_0 0 val_0 -0 val_0 0 val_0 -0 val_0 0 val_0 -0 val_0 0 val_0 -105 val_105 105 val_105 +238 val_238 238 val_238 +238 val_238 238 val_238 +86 val_86 86 val_86 +311 val_311 311 val_311 +311 val_311 311 val_311 +311 val_311 311 val_311 +27 val_27 27 val_27 +165 val_165 165 val_165 +165 val_165 165 val_165 +409 val_409 409 val_409 PREHOOK: query: -- func filter on outer join EXPLAIN SELECT /*+ MAPJOIN(a) */ * FROM src a RIGHT OUTER JOIN src b on a.key=b.key AND b.key * 10 < '1000' limit 10 @@ -123,14 +132,13 @@ EXPLAIN SELECT /*+ MAPJOIN(a) */ * FROM src a RIGHT OUTER JOIN src b on a.key=b.key AND b.key * 10 < '1000' limit 10 POSTHOOK: type: QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 Stage-0 depends on stages: Stage-1 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-2 Spark - Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 3 (PARTITION-LEVEL SORT, 3) #### A masked pattern was here #### Vertices: Map 1 @@ -138,50 +146,60 @@ STAGE PLANS: TableScan alias: a Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: string) - sort order: + - Map-reduce partition columns: key (type: string) - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: value (type: string) - Map 3 + Spark HashTable Sink Operator + condition expressions: + 0 {value} + 1 {key} {value} + filter predicates: + 0 + 1 {((key * 10) < '1000')} + keys: + 0 key (type: string) + 1 key (type: string) + Local Work: + Map Reduce Local Work + + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: + Map 2 Map Operator Tree: TableScan alias: b Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: string) - sort order: + - Map-reduce partition columns: key (type: string) - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: value (type: string) - Reducer 2 - Reduce Operator Tree: - Join Operator - condition map: - Right Outer Join0 to 1 - condition expressions: - 0 {KEY.reducesinkkey0} {VALUE._col0} - 1 {KEY.reducesinkkey0} {VALUE._col0} - filter predicates: - 0 - 1 {((KEY.reducesinkkey0 * 10) < '1000')} - outputColumnNames: _col0, _col1, _col5, _col6 - Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE - Select Operator - expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string) - outputColumnNames: _col0, _col1, _col2, _col3 - Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE - Limit - Number of rows: 10 - Statistics: Num rows: 10 Data size: 100 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - Statistics: Num rows: 10 Data size: 100 Basic stats: COMPLETE Column stats: NONE - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Map Join Operator + condition map: + Right Outer Join0 to 1 + condition expressions: + 0 {key} {value} + 1 {key} {value} + filter predicates: + 0 + 1 {((key * 10) < '1000')} + keys: + 0 key (type: string) + 1 key (type: string) + outputColumnNames: _col0, _col1, _col5, _col6 + input vertices: + 0 Map 1 + Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE + Limit + Number of rows: 10 + Statistics: Num rows: 10 Data size: 100 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 10 Data size: 100 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Local Work: + Map Reduce Local Work Stage: Stage-0 Fetch Operator @@ -197,16 +215,16 @@ POSTHOOK: query: SELECT /*+ MAPJOIN(a) */ * FROM src a RIGHT OUTER JOIN src b on POSTHOOK: type: QUERY POSTHOOK: Input: default@src #### A masked pattern was here #### -0 val_0 0 val_0 -0 val_0 0 val_0 -0 val_0 0 val_0 -0 val_0 0 val_0 -0 val_0 0 val_0 -0 val_0 0 val_0 -0 val_0 0 val_0 -0 val_0 0 val_0 -0 val_0 0 val_0 -NULL NULL 105 val_105 +NULL NULL 238 val_238 +86 val_86 86 val_86 +NULL NULL 311 val_311 +27 val_27 27 val_27 +NULL NULL 165 val_165 +NULL NULL 409 val_409 +NULL NULL 255 val_255 +NULL NULL 278 val_278 +98 val_98 98 val_98 +98 val_98 98 val_98 PREHOOK: query: -- field filter on outer join EXPLAIN SELECT /*+ MAPJOIN(a) */ * FROM src a RIGHT OUTER JOIN @@ -218,14 +236,35 @@ SELECT /*+ MAPJOIN(a) */ * FROM src a RIGHT OUTER JOIN (select key, named_struct('key', key, 'value', value) as kv from src) b on a.key=b.key AND b.kv.key > 200 limit 10 POSTHOOK: type: QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 Stage-0 depends on stages: Stage-1 STAGE PLANS: + Stage: Stage-2 + Spark +#### A masked pattern was here #### + Vertices: + Map 2 + Map Operator Tree: + TableScan + alias: a + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Spark HashTable Sink Operator + condition expressions: + 0 {value} + 1 {_col0} {_col1} + filter predicates: + 0 + 1 {(_col1.key > 200)} + keys: + 0 key (type: string) + 1 _col0 (type: string) + Local Work: + Map Reduce Local Work + Stage: Stage-1 Spark - Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 3 (PARTITION-LEVEL SORT, 3) #### A masked pattern was here #### Vertices: Map 1 @@ -237,50 +276,38 @@ STAGE PLANS: expressions: key (type: string), named_struct('key',key,'value',value) (type: struct) outputColumnNames: _col0, _col1 Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: _col0 (type: string) - sort order: + - Map-reduce partition columns: _col0 (type: string) - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: struct) - Map 3 - Map Operator Tree: - TableScan - alias: a - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: string) - sort order: + - Map-reduce partition columns: key (type: string) - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: value (type: string) - Reducer 2 - Reduce Operator Tree: - Join Operator - condition map: - Right Outer Join0 to 1 - condition expressions: - 0 {KEY.reducesinkkey0} {VALUE._col0} - 1 {KEY.reducesinkkey0} {VALUE._col0} - filter predicates: - 0 - 1 {(VALUE._col0.key > 200)} - outputColumnNames: _col0, _col1, _col5, _col6 - Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE - Select Operator - expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: struct) - outputColumnNames: _col0, _col1, _col2, _col3 - Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE - Limit - Number of rows: 10 - Statistics: Num rows: 10 Data size: 100 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - Statistics: Num rows: 10 Data size: 100 Basic stats: COMPLETE Column stats: NONE - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Map Join Operator + condition map: + Right Outer Join0 to 1 + condition expressions: + 0 {key} {value} + 1 {_col0} {_col1} + filter predicates: + 0 + 1 {(_col1.key > 200)} + keys: + 0 key (type: string) + 1 _col0 (type: string) + outputColumnNames: _col0, _col1, _col5, _col6 + input vertices: + 0 Map 2 + Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: struct) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE + Limit + Number of rows: 10 + Statistics: Num rows: 10 Data size: 100 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 10 Data size: 100 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Local Work: + Map Reduce Local Work Stage: Stage-0 Fetch Operator @@ -298,16 +325,16 @@ POSTHOOK: query: SELECT /*+ MAPJOIN(a) */ * FROM src a RIGHT OUTER JOIN POSTHOOK: type: QUERY POSTHOOK: Input: default@src #### A masked pattern was here #### -NULL NULL 0 {"key":"0","value":"val_0"} -NULL NULL 0 {"key":"0","value":"val_0"} -NULL NULL 0 {"key":"0","value":"val_0"} -NULL NULL 105 {"key":"105","value":"val_105"} -NULL NULL 111 {"key":"111","value":"val_111"} -NULL NULL 114 {"key":"114","value":"val_114"} -NULL NULL 12 {"key":"12","value":"val_12"} -NULL NULL 12 {"key":"12","value":"val_12"} -NULL NULL 120 {"key":"120","value":"val_120"} -NULL NULL 120 {"key":"120","value":"val_120"} +238 val_238 238 {"key":"238","value":"val_238"} +238 val_238 238 {"key":"238","value":"val_238"} +NULL NULL 86 {"key":"86","value":"val_86"} +311 val_311 311 {"key":"311","value":"val_311"} +311 val_311 311 {"key":"311","value":"val_311"} +311 val_311 311 {"key":"311","value":"val_311"} +NULL NULL 27 {"key":"27","value":"val_27"} +NULL NULL 165 {"key":"165","value":"val_165"} +409 val_409 409 {"key":"409","value":"val_409"} +409 val_409 409 {"key":"409","value":"val_409"} PREHOOK: query: EXPLAIN SELECT /*+ MAPJOIN(a) */ * FROM src a RIGHT OUTER JOIN src b on a.key=b.key AND true limit 10 PREHOOK: type: QUERY @@ -315,14 +342,13 @@ POSTHOOK: query: EXPLAIN SELECT /*+ MAPJOIN(a) */ * FROM src a RIGHT OUTER JOIN src b on a.key=b.key AND true limit 10 POSTHOOK: type: QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 Stage-0 depends on stages: Stage-1 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-2 Spark - Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 3 (PARTITION-LEVEL SORT, 3) #### A masked pattern was here #### Vertices: Map 1 @@ -330,47 +356,54 @@ STAGE PLANS: TableScan alias: a Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: string) - sort order: + - Map-reduce partition columns: key (type: string) - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: value (type: string) - Map 3 + Spark HashTable Sink Operator + condition expressions: + 0 {value} + 1 {key} {value} + keys: + 0 key (type: string) + 1 key (type: string) + Local Work: + Map Reduce Local Work + + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: + Map 2 Map Operator Tree: TableScan alias: b Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: string) - sort order: + - Map-reduce partition columns: key (type: string) - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: value (type: string) - Reducer 2 - Reduce Operator Tree: - Join Operator - condition map: - Right Outer Join0 to 1 - condition expressions: - 0 {KEY.reducesinkkey0} {VALUE._col0} - 1 {KEY.reducesinkkey0} {VALUE._col0} - outputColumnNames: _col0, _col1, _col5, _col6 - Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE - Select Operator - expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string) - outputColumnNames: _col0, _col1, _col2, _col3 - Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE - Limit - Number of rows: 10 - Statistics: Num rows: 10 Data size: 100 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - Statistics: Num rows: 10 Data size: 100 Basic stats: COMPLETE Column stats: NONE - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Map Join Operator + condition map: + Right Outer Join0 to 1 + condition expressions: + 0 {key} {value} + 1 {key} {value} + keys: + 0 key (type: string) + 1 key (type: string) + outputColumnNames: _col0, _col1, _col5, _col6 + input vertices: + 0 Map 1 + Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE + Limit + Number of rows: 10 + Statistics: Num rows: 10 Data size: 100 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 10 Data size: 100 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Local Work: + Map Reduce Local Work Stage: Stage-0 Fetch Operator @@ -386,16 +419,16 @@ POSTHOOK: query: SELECT /*+ MAPJOIN(a) */ * FROM src a RIGHT OUTER JOIN src b on POSTHOOK: type: QUERY POSTHOOK: Input: default@src #### A masked pattern was here #### -0 val_0 0 val_0 -0 val_0 0 val_0 -0 val_0 0 val_0 -0 val_0 0 val_0 -0 val_0 0 val_0 -0 val_0 0 val_0 -0 val_0 0 val_0 -0 val_0 0 val_0 -0 val_0 0 val_0 -105 val_105 105 val_105 +238 val_238 238 val_238 +238 val_238 238 val_238 +86 val_86 86 val_86 +311 val_311 311 val_311 +311 val_311 311 val_311 +311 val_311 311 val_311 +27 val_27 27 val_27 +165 val_165 165 val_165 +165 val_165 165 val_165 +409 val_409 409 val_409 PREHOOK: query: EXPLAIN SELECT /*+ MAPJOIN(a) */ * FROM src a RIGHT OUTER JOIN src b on a.key=b.key AND b.key * 10 < '1000' limit 10 PREHOOK: type: QUERY @@ -403,14 +436,13 @@ POSTHOOK: query: EXPLAIN SELECT /*+ MAPJOIN(a) */ * FROM src a RIGHT OUTER JOIN src b on a.key=b.key AND b.key * 10 < '1000' limit 10 POSTHOOK: type: QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 Stage-0 depends on stages: Stage-1 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-2 Spark - Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 3 (PARTITION-LEVEL SORT, 3) #### A masked pattern was here #### Vertices: Map 1 @@ -421,13 +453,21 @@ STAGE PLANS: Filter Operator predicate: ((key * 10) < '1000') (type: boolean) Statistics: Num rows: 166 Data size: 1763 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: string) - sort order: + - Map-reduce partition columns: key (type: string) - Statistics: Num rows: 166 Data size: 1763 Basic stats: COMPLETE Column stats: NONE - value expressions: value (type: string) - Map 3 + Spark HashTable Sink Operator + condition expressions: + 0 {value} + 1 {key} {value} + keys: + 0 key (type: string) + 1 key (type: string) + Local Work: + Map Reduce Local Work + + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: + Map 2 Map Operator Tree: TableScan alias: b @@ -435,36 +475,35 @@ STAGE PLANS: Filter Operator predicate: ((key * 10) < '1000') (type: boolean) Statistics: Num rows: 166 Data size: 1763 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: string) - sort order: + - Map-reduce partition columns: key (type: string) - Statistics: Num rows: 166 Data size: 1763 Basic stats: COMPLETE Column stats: NONE - value expressions: value (type: string) - Reducer 2 - Reduce Operator Tree: - Join Operator - condition map: - Right Outer Join0 to 1 - condition expressions: - 0 {KEY.reducesinkkey0} {VALUE._col0} - 1 {KEY.reducesinkkey0} {VALUE._col0} - outputColumnNames: _col0, _col1, _col5, _col6 - Statistics: Num rows: 182 Data size: 1939 Basic stats: COMPLETE Column stats: NONE - Select Operator - expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string) - outputColumnNames: _col0, _col1, _col2, _col3 - Statistics: Num rows: 182 Data size: 1939 Basic stats: COMPLETE Column stats: NONE - Limit - Number of rows: 10 - Statistics: Num rows: 10 Data size: 100 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - Statistics: Num rows: 10 Data size: 100 Basic stats: COMPLETE Column stats: NONE - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Map Join Operator + condition map: + Right Outer Join0 to 1 + condition expressions: + 0 {key} {value} + 1 {key} {value} + keys: + 0 key (type: string) + 1 key (type: string) + outputColumnNames: _col0, _col1, _col5, _col6 + input vertices: + 0 Map 1 + Statistics: Num rows: 182 Data size: 1939 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 182 Data size: 1939 Basic stats: COMPLETE Column stats: NONE + Limit + Number of rows: 10 + Statistics: Num rows: 10 Data size: 100 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 10 Data size: 100 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Local Work: + Map Reduce Local Work Stage: Stage-0 Fetch Operator @@ -480,16 +519,16 @@ POSTHOOK: query: SELECT /*+ MAPJOIN(a) */ * FROM src a RIGHT OUTER JOIN src b on POSTHOOK: type: QUERY POSTHOOK: Input: default@src #### A masked pattern was here #### -0 val_0 0 val_0 -0 val_0 0 val_0 -0 val_0 0 val_0 -0 val_0 0 val_0 -0 val_0 0 val_0 -0 val_0 0 val_0 -0 val_0 0 val_0 -0 val_0 0 val_0 -0 val_0 0 val_0 -12 val_12 12 val_12 +86 val_86 86 val_86 +27 val_27 27 val_27 +98 val_98 98 val_98 +98 val_98 98 val_98 +66 val_66 66 val_66 +37 val_37 37 val_37 +37 val_37 37 val_37 +15 val_15 15 val_15 +15 val_15 15 val_15 +82 val_82 82 val_82 PREHOOK: query: EXPLAIN SELECT /*+ MAPJOIN(a) */ * FROM src a RIGHT OUTER JOIN (select key, named_struct('key', key, 'value', value) as kv from src) b on a.key=b.key AND b.kv.key > 200 limit 10 @@ -499,14 +538,32 @@ SELECT /*+ MAPJOIN(a) */ * FROM src a RIGHT OUTER JOIN (select key, named_struct('key', key, 'value', value) as kv from src) b on a.key=b.key AND b.kv.key > 200 limit 10 POSTHOOK: type: QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 Stage-0 depends on stages: Stage-1 STAGE PLANS: + Stage: Stage-2 + Spark +#### A masked pattern was here #### + Vertices: + Map 2 + Map Operator Tree: + TableScan + alias: a + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Spark HashTable Sink Operator + condition expressions: + 0 {value} + 1 {_col0} {_col1} + keys: + 0 key (type: string) + 1 _col0 (type: string) + Local Work: + Map Reduce Local Work + Stage: Stage-1 Spark - Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 3 (PARTITION-LEVEL SORT, 3) #### A masked pattern was here #### Vertices: Map 1 @@ -521,47 +578,35 @@ STAGE PLANS: Filter Operator predicate: (_col1.key > 200) (type: boolean) Statistics: Num rows: 166 Data size: 1763 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: _col0 (type: string) - sort order: + - Map-reduce partition columns: _col0 (type: string) - Statistics: Num rows: 166 Data size: 1763 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: struct) - Map 3 - Map Operator Tree: - TableScan - alias: a - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: string) - sort order: + - Map-reduce partition columns: key (type: string) - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: value (type: string) - Reducer 2 - Reduce Operator Tree: - Join Operator - condition map: - Right Outer Join0 to 1 - condition expressions: - 0 {KEY.reducesinkkey0} {VALUE._col0} - 1 {KEY.reducesinkkey0} {VALUE._col0} - outputColumnNames: _col0, _col1, _col5, _col6 - Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE - Select Operator - expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: struct) - outputColumnNames: _col0, _col1, _col2, _col3 - Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE - Limit - Number of rows: 10 - Statistics: Num rows: 10 Data size: 100 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - Statistics: Num rows: 10 Data size: 100 Basic stats: COMPLETE Column stats: NONE - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Map Join Operator + condition map: + Right Outer Join0 to 1 + condition expressions: + 0 {key} {value} + 1 {_col0} {_col1} + keys: + 0 key (type: string) + 1 _col0 (type: string) + outputColumnNames: _col0, _col1, _col5, _col6 + input vertices: + 0 Map 2 + Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: struct) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE + Limit + Number of rows: 10 + Statistics: Num rows: 10 Data size: 100 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 10 Data size: 100 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Local Work: + Map Reduce Local Work Stage: Stage-0 Fetch Operator @@ -579,13 +624,13 @@ POSTHOOK: query: SELECT /*+ MAPJOIN(a) */ * FROM src a RIGHT OUTER JOIN POSTHOOK: type: QUERY POSTHOOK: Input: default@src #### A masked pattern was here #### -201 val_201 201 {"key":"201","value":"val_201"} -207 val_207 207 {"key":"207","value":"val_207"} -207 val_207 207 {"key":"207","value":"val_207"} -207 val_207 207 {"key":"207","value":"val_207"} -207 val_207 207 {"key":"207","value":"val_207"} -213 val_213 213 {"key":"213","value":"val_213"} -213 val_213 213 {"key":"213","value":"val_213"} -213 val_213 213 {"key":"213","value":"val_213"} -213 val_213 213 {"key":"213","value":"val_213"} -216 val_216 216 {"key":"216","value":"val_216"} +238 val_238 238 {"key":"238","value":"val_238"} +238 val_238 238 {"key":"238","value":"val_238"} +311 val_311 311 {"key":"311","value":"val_311"} +311 val_311 311 {"key":"311","value":"val_311"} +311 val_311 311 {"key":"311","value":"val_311"} +409 val_409 409 {"key":"409","value":"val_409"} +409 val_409 409 {"key":"409","value":"val_409"} +409 val_409 409 {"key":"409","value":"val_409"} +255 val_255 255 {"key":"255","value":"val_255"} +255 val_255 255 {"key":"255","value":"val_255"} diff --git ql/src/test/results/clientpositive/spark/mapjoin_distinct.q.out ql/src/test/results/clientpositive/spark/mapjoin_distinct.q.out index dc7241c..fcc8d6a 100644 --- ql/src/test/results/clientpositive/spark/mapjoin_distinct.q.out +++ ql/src/test/results/clientpositive/spark/mapjoin_distinct.q.out @@ -11,67 +11,75 @@ ON ( c.key=d.key AND c.ds='2008-04-08' AND d.ds='2008-04-08') SELECT /*+ MAPJOIN(d) */ DISTINCT c.value POSTHOOK: type: QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 Stage-0 depends on stages: Stage-1 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-2 Spark - Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 5 (PARTITION-LEVEL SORT, 1) - Reducer 3 <- Reducer 2 (GROUP PARTITION-LEVEL SORT, 1) - Reducer 4 <- Reducer 3 (GROUP, 1) #### A masked pattern was here #### Vertices: - Map 1 + Map 4 Map Operator Tree: TableScan - alias: c + alias: d Statistics: Num rows: 1000 Data size: 10624 Basic stats: COMPLETE Column stats: NONE Filter Operator predicate: key is not null (type: boolean) Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: string) - sort order: + - Map-reduce partition columns: key (type: string) - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: value (type: string) - Map 5 + Spark HashTable Sink Operator + condition expressions: + 0 {value} + 1 + keys: + 0 key (type: string) + 1 key (type: string) + Local Work: + Map Reduce Local Work + + Stage: Stage-1 + Spark + Edges: + Reducer 2 <- Map 1 (GROUP PARTITION-LEVEL SORT, 1) + Reducer 3 <- Reducer 2 (GROUP, 1) +#### A masked pattern was here #### + Vertices: + Map 1 Map Operator Tree: TableScan - alias: d + alias: c Statistics: Num rows: 1000 Data size: 10624 Basic stats: COMPLETE Column stats: NONE Filter Operator predicate: key is not null (type: boolean) Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: string) - sort order: + - Map-reduce partition columns: key (type: string) - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {value} + 1 + keys: + 0 key (type: string) + 1 key (type: string) + outputColumnNames: _col1 + input vertices: + 1 Map 4 + Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE + Group By Operator + keys: _col1 (type: string) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: rand() (type: double) + Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE + Local Work: + Map Reduce Local Work Reducer 2 Reduce Operator Tree: - Join Operator - condition map: - Inner Join 0 to 1 - condition expressions: - 0 {VALUE._col0} - 1 - outputColumnNames: _col1 - Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE - Group By Operator - keys: _col1 (type: string) - mode: hash - outputColumnNames: _col0 - Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: _col0 (type: string) - sort order: + - Map-reduce partition columns: rand() (type: double) - Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE - Reducer 3 - Reduce Operator Tree: Group By Operator keys: KEY._col0 (type: string) mode: partials @@ -82,7 +90,7 @@ STAGE PLANS: sort order: + Map-reduce partition columns: _col0 (type: string) Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE - Reducer 4 + Reducer 3 Reduce Operator Tree: Group By Operator keys: KEY._col0 (type: string) @@ -148,66 +156,74 @@ ON ( c.key=d.key AND c.ds='2008-04-08' AND d.ds='2008-04-08') SELECT /*+ MAPJOIN(d) */ DISTINCT c.value POSTHOOK: type: QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 Stage-0 depends on stages: Stage-1 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-2 Spark - Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 4 (PARTITION-LEVEL SORT, 3) - Reducer 3 <- Reducer 2 (GROUP, 3) #### A masked pattern was here #### Vertices: - Map 1 + Map 3 Map Operator Tree: TableScan - alias: c + alias: d Statistics: Num rows: 1000 Data size: 10624 Basic stats: COMPLETE Column stats: NONE Filter Operator predicate: key is not null (type: boolean) Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: string) - sort order: + - Map-reduce partition columns: key (type: string) - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: value (type: string) - Map 4 + Spark HashTable Sink Operator + condition expressions: + 0 {value} + 1 + keys: + 0 key (type: string) + 1 key (type: string) + Local Work: + Map Reduce Local Work + + Stage: Stage-1 + Spark + Edges: + Reducer 2 <- Map 1 (GROUP, 3) +#### A masked pattern was here #### + Vertices: + Map 1 Map Operator Tree: TableScan - alias: d + alias: c Statistics: Num rows: 1000 Data size: 10624 Basic stats: COMPLETE Column stats: NONE Filter Operator predicate: key is not null (type: boolean) Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: string) - sort order: + - Map-reduce partition columns: key (type: string) - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {value} + 1 + keys: + 0 key (type: string) + 1 key (type: string) + outputColumnNames: _col1 + input vertices: + 1 Map 3 + Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE + Group By Operator + keys: _col1 (type: string) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE + Local Work: + Map Reduce Local Work Reducer 2 Reduce Operator Tree: - Join Operator - condition map: - Inner Join 0 to 1 - condition expressions: - 0 {VALUE._col0} - 1 - outputColumnNames: _col1 - Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE - Group By Operator - keys: _col1 (type: string) - mode: hash - outputColumnNames: _col0 - Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: _col0 (type: string) - sort order: + - Map-reduce partition columns: _col0 (type: string) - Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE - Reducer 3 - Reduce Operator Tree: Group By Operator keys: KEY._col0 (type: string) mode: mergepartial @@ -272,62 +288,70 @@ ON ( c.key=d.key AND c.ds='2008-04-08' AND d.ds='2008-04-08') SELECT /*+ MAPJOIN(d) */ DISTINCT c.value POSTHOOK: type: QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 Stage-0 depends on stages: Stage-1 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-2 Spark - Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 5 (PARTITION-LEVEL SORT, 3) - Reducer 3 <- Reducer 2 (GROUP PARTITION-LEVEL SORT, 3) - Reducer 4 <- Reducer 3 (GROUP, 3) #### A masked pattern was here #### Vertices: - Map 1 + Map 4 Map Operator Tree: TableScan - alias: c + alias: d Statistics: Num rows: 1000 Data size: 10624 Basic stats: COMPLETE Column stats: NONE Filter Operator predicate: key is not null (type: boolean) Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: string) - sort order: + - Map-reduce partition columns: key (type: string) - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: value (type: string) - Map 5 + Spark HashTable Sink Operator + condition expressions: + 0 {value} + 1 + keys: + 0 key (type: string) + 1 key (type: string) + Local Work: + Map Reduce Local Work + + Stage: Stage-1 + Spark + Edges: + Reducer 2 <- Map 1 (GROUP PARTITION-LEVEL SORT, 3) + Reducer 3 <- Reducer 2 (GROUP, 3) +#### A masked pattern was here #### + Vertices: + Map 1 Map Operator Tree: TableScan - alias: d + alias: c Statistics: Num rows: 1000 Data size: 10624 Basic stats: COMPLETE Column stats: NONE Filter Operator predicate: key is not null (type: boolean) Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: string) - sort order: + - Map-reduce partition columns: key (type: string) - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {value} + 1 + keys: + 0 key (type: string) + 1 key (type: string) + outputColumnNames: _col1 + input vertices: + 1 Map 4 + Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col1 (type: string) + sort order: + + Map-reduce partition columns: rand() (type: double) + Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE + Local Work: + Map Reduce Local Work Reducer 2 Reduce Operator Tree: - Join Operator - condition map: - Inner Join 0 to 1 - condition expressions: - 0 {VALUE._col0} - 1 - outputColumnNames: _col1 - Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: _col1 (type: string) - sort order: + - Map-reduce partition columns: rand() (type: double) - Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE - Reducer 3 - Reduce Operator Tree: Group By Operator keys: KEY._col0 (type: string) mode: partial1 @@ -338,7 +362,7 @@ STAGE PLANS: sort order: + Map-reduce partition columns: _col0 (type: string) Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE - Reducer 4 + Reducer 3 Reduce Operator Tree: Group By Operator keys: KEY._col0 (type: string) @@ -404,61 +428,69 @@ ON ( c.key=d.key AND c.ds='2008-04-08' AND d.ds='2008-04-08') SELECT /*+ MAPJOIN(d) */ DISTINCT c.value POSTHOOK: type: QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 Stage-0 depends on stages: Stage-1 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-2 Spark - Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 4 (PARTITION-LEVEL SORT, 3) - Reducer 3 <- Reducer 2 (GROUP, 3) #### A masked pattern was here #### Vertices: - Map 1 + Map 3 Map Operator Tree: TableScan - alias: c + alias: d Statistics: Num rows: 1000 Data size: 10624 Basic stats: COMPLETE Column stats: NONE Filter Operator predicate: key is not null (type: boolean) Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: string) - sort order: + - Map-reduce partition columns: key (type: string) - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: value (type: string) - Map 4 + Spark HashTable Sink Operator + condition expressions: + 0 {value} + 1 + keys: + 0 key (type: string) + 1 key (type: string) + Local Work: + Map Reduce Local Work + + Stage: Stage-1 + Spark + Edges: + Reducer 2 <- Map 1 (GROUP, 3) +#### A masked pattern was here #### + Vertices: + Map 1 Map Operator Tree: TableScan - alias: d + alias: c Statistics: Num rows: 1000 Data size: 10624 Basic stats: COMPLETE Column stats: NONE Filter Operator predicate: key is not null (type: boolean) Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: string) - sort order: + - Map-reduce partition columns: key (type: string) - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {value} + 1 + keys: + 0 key (type: string) + 1 key (type: string) + outputColumnNames: _col1 + input vertices: + 1 Map 3 + Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col1 (type: string) + sort order: + + Map-reduce partition columns: _col1 (type: string) + Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE + Local Work: + Map Reduce Local Work Reducer 2 Reduce Operator Tree: - Join Operator - condition map: - Inner Join 0 to 1 - condition expressions: - 0 {VALUE._col0} - 1 - outputColumnNames: _col1 - Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: _col1 (type: string) - sort order: + - Map-reduce partition columns: _col1 (type: string) - Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE - Reducer 3 - Reduce Operator Tree: Group By Operator keys: KEY._col0 (type: string) mode: complete diff --git ql/src/test/results/clientpositive/spark/semijoin.q.out ql/src/test/results/clientpositive/spark/semijoin.q.out index 2b8e04b..8168832 100644 --- ql/src/test/results/clientpositive/spark/semijoin.q.out +++ ql/src/test/results/clientpositive/spark/semijoin.q.out @@ -931,31 +931,16 @@ PREHOOK: type: QUERY POSTHOOK: query: explain select /*+ mapjoin(b) */ a.key from t3 a left semi join t1 b on a.key = b.key sort by a.key POSTHOOK: type: QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 Stage-0 depends on stages: Stage-1 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-2 Spark - Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 4 (PARTITION-LEVEL SORT, 3) - Reducer 3 <- Reducer 2 (SORT, 3) #### A masked pattern was here #### Vertices: - Map 1 - Map Operator Tree: - TableScan - alias: a - Statistics: Num rows: 22 Data size: 163 Basic stats: COMPLETE Column stats: NONE - Filter Operator - predicate: key is not null (type: boolean) - Statistics: Num rows: 11 Data size: 81 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: int) - sort order: + - Map-reduce partition columns: key (type: int) - Statistics: Num rows: 11 Data size: 81 Basic stats: COMPLETE Column stats: NONE - Map 4 + Map 3 Map Operator Tree: TableScan alias: b @@ -972,27 +957,51 @@ STAGE PLANS: mode: hash outputColumnNames: _col0 Statistics: Num rows: 6 Data size: 43 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: _col0 (type: int) - sort order: + - Map-reduce partition columns: _col0 (type: int) - Statistics: Num rows: 6 Data size: 43 Basic stats: COMPLETE Column stats: NONE + Spark HashTable Sink Operator + condition expressions: + 0 {key} + 1 + keys: + 0 key (type: int) + 1 _col0 (type: int) + Local Work: + Map Reduce Local Work + + Stage: Stage-1 + Spark + Edges: + Reducer 2 <- Map 1 (SORT, 3) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: a + Statistics: Num rows: 22 Data size: 163 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: key is not null (type: boolean) + Statistics: Num rows: 11 Data size: 81 Basic stats: COMPLETE Column stats: NONE + Map Join Operator + condition map: + Left Semi Join 0 to 1 + condition expressions: + 0 {key} + 1 + keys: + 0 key (type: int) + 1 _col0 (type: int) + outputColumnNames: _col0 + input vertices: + 1 Map 3 + Statistics: Num rows: 12 Data size: 89 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Statistics: Num rows: 12 Data size: 89 Basic stats: COMPLETE Column stats: NONE + Local Work: + Map Reduce Local Work Reducer 2 Reduce Operator Tree: - Join Operator - condition map: - Left Semi Join 0 to 1 - condition expressions: - 0 {KEY.reducesinkkey0} - 1 - outputColumnNames: _col0 - Statistics: Num rows: 12 Data size: 89 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: _col0 (type: int) - sort order: + - Statistics: Num rows: 12 Data size: 89 Basic stats: COMPLETE Column stats: NONE - Reducer 3 - Reduce Operator Tree: Select Operator expressions: KEY.reducesinkkey0 (type: int) outputColumnNames: _col0 @@ -1387,31 +1396,16 @@ PREHOOK: type: QUERY POSTHOOK: query: explain select /*+ mapjoin(b, c) */ a.key from t3 a left semi join t1 b on a.key = b.key left semi join t2 c on a.key = c.key sort by a.key POSTHOOK: type: QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 Stage-0 depends on stages: Stage-1 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-2 Spark - Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 4 (PARTITION-LEVEL SORT, 3), Map 5 (PARTITION-LEVEL SORT, 3) - Reducer 3 <- Reducer 2 (SORT, 3) #### A masked pattern was here #### Vertices: - Map 1 - Map Operator Tree: - TableScan - alias: a - Statistics: Num rows: 22 Data size: 163 Basic stats: COMPLETE Column stats: NONE - Filter Operator - predicate: key is not null (type: boolean) - Statistics: Num rows: 11 Data size: 81 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: key (type: int) - sort order: + - Map-reduce partition columns: key (type: int) - Statistics: Num rows: 11 Data size: 81 Basic stats: COMPLETE Column stats: NONE - Map 4 + Map 3 Map Operator Tree: TableScan alias: b @@ -1428,12 +1422,18 @@ STAGE PLANS: mode: hash outputColumnNames: _col0 Statistics: Num rows: 6 Data size: 43 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: _col0 (type: int) - sort order: + - Map-reduce partition columns: _col0 (type: int) - Statistics: Num rows: 6 Data size: 43 Basic stats: COMPLETE Column stats: NONE - Map 5 + Spark HashTable Sink Operator + condition expressions: + 0 {key} + 1 + 2 + keys: + 0 key (type: int) + 1 _col0 (type: int) + 2 _col0 (type: int) + Local Work: + Map Reduce Local Work + Map 4 Map Operator Tree: TableScan alias: c @@ -1450,29 +1450,57 @@ STAGE PLANS: mode: hash outputColumnNames: _col0 Statistics: Num rows: 6 Data size: 45 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: _col0 (type: int) - sort order: + - Map-reduce partition columns: _col0 (type: int) - Statistics: Num rows: 6 Data size: 45 Basic stats: COMPLETE Column stats: NONE + Spark HashTable Sink Operator + condition expressions: + 0 {key} + 1 + 2 + keys: + 0 key (type: int) + 1 _col0 (type: int) + 2 _col0 (type: int) + Local Work: + Map Reduce Local Work + + Stage: Stage-1 + Spark + Edges: + Reducer 2 <- Map 1 (SORT, 3) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: a + Statistics: Num rows: 22 Data size: 163 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: key is not null (type: boolean) + Statistics: Num rows: 11 Data size: 81 Basic stats: COMPLETE Column stats: NONE + Map Join Operator + condition map: + Left Semi Join 0 to 1 + Left Semi Join 0 to 2 + condition expressions: + 0 {key} + 1 + 2 + keys: + 0 key (type: int) + 1 _col0 (type: int) + 2 _col0 (type: int) + outputColumnNames: _col0 + input vertices: + 1 Map 3 + 2 Map 4 + Statistics: Num rows: 24 Data size: 178 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Statistics: Num rows: 24 Data size: 178 Basic stats: COMPLETE Column stats: NONE + Local Work: + Map Reduce Local Work Reducer 2 Reduce Operator Tree: - Join Operator - condition map: - Left Semi Join 0 to 1 - Left Semi Join 0 to 2 - condition expressions: - 0 {KEY.reducesinkkey0} - 1 - 2 - outputColumnNames: _col0 - Statistics: Num rows: 24 Data size: 178 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: _col0 (type: int) - sort order: + - Statistics: Num rows: 24 Data size: 178 Basic stats: COMPLETE Column stats: NONE - Reducer 3 - Reduce Operator Tree: Select Operator expressions: KEY.reducesinkkey0 (type: int) outputColumnNames: _col0 diff --git ql/src/test/results/clientpositive/spark/skewjoin.q.out ql/src/test/results/clientpositive/spark/skewjoin.q.out index 56b78be..c944de5 100644 --- ql/src/test/results/clientpositive/spark/skewjoin.q.out +++ ql/src/test/results/clientpositive/spark/skewjoin.q.out @@ -949,15 +949,34 @@ POSTHOOK: query: EXPLAIN SELECT /*+ mapjoin(v)*/ sum(hash(k.key)), sum(hash(v.val)) FROM T1 k LEFT OUTER JOIN T1 v ON k.key+1=v.key POSTHOOK: type: QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 Stage-0 depends on stages: Stage-1 STAGE PLANS: + Stage: Stage-2 + Spark +#### A masked pattern was here #### + Vertices: + Map 3 + Map Operator Tree: + TableScan + alias: v + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Spark HashTable Sink Operator + condition expressions: + 0 {key} + 1 {val} + keys: + 0 (key + 1) (type: double) + 1 UDFToDouble(key) (type: double) + Local Work: + Map Reduce Local Work + Stage: Stage-1 Spark Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 3), Map 4 (PARTITION-LEVEL SORT, 3) - Reducer 3 <- Reducer 2 (GROUP, 1) + Reducer 2 <- Map 1 (GROUP, 1) #### A masked pattern was here #### Vertices: Map 1 @@ -965,44 +984,32 @@ STAGE PLANS: TableScan alias: k Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE - Reduce Output Operator - key expressions: (key + 1) (type: double) - sort order: + - Map-reduce partition columns: (key + 1) (type: double) - Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE - value expressions: key (type: string) - Map 4 - Map Operator Tree: - TableScan - alias: v - Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE - Reduce Output Operator - key expressions: UDFToDouble(key) (type: double) - sort order: + - Map-reduce partition columns: UDFToDouble(key) (type: double) - Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE - value expressions: val (type: string) + Map Join Operator + condition map: + Left Outer Join0 to 1 + condition expressions: + 0 {key} + 1 {val} + keys: + 0 (key + 1) (type: double) + 1 UDFToDouble(key) (type: double) + outputColumnNames: _col0, _col6 + input vertices: + 1 Map 3 + Statistics: Num rows: 0 Data size: 33 Basic stats: PARTIAL Column stats: NONE + Group By Operator + aggregations: sum(hash(_col0)), sum(hash(_col6)) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 16 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 16 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint), _col1 (type: bigint) + Local Work: + Map Reduce Local Work Reducer 2 Reduce Operator Tree: - Join Operator - condition map: - Left Outer Join0 to 1 - condition expressions: - 0 {VALUE._col0} - 1 {VALUE._col1} - outputColumnNames: _col0, _col6 - Statistics: Num rows: 0 Data size: 33 Basic stats: PARTIAL Column stats: NONE - Group By Operator - aggregations: sum(hash(_col0)), sum(hash(_col6)) - mode: hash - outputColumnNames: _col0, _col1 - Statistics: Num rows: 1 Data size: 16 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - sort order: - Statistics: Num rows: 1 Data size: 16 Basic stats: COMPLETE Column stats: NONE - value expressions: _col0 (type: bigint), _col1 (type: bigint) - Reducer 3 - Reduce Operator Tree: Group By Operator aggregations: sum(VALUE._col0), sum(VALUE._col1) mode: mergepartial