diff --git itests/src/test/resources/testconfiguration.properties itests/src/test/resources/testconfiguration.properties index 38380fb..d84a42c 100644 --- itests/src/test/resources/testconfiguration.properties +++ itests/src/test/resources/testconfiguration.properties @@ -655,6 +655,7 @@ spark.query.files=add_part_multiple.q, \ join_cond_pushdown_unqual2.q, \ join_cond_pushdown_unqual3.q, \ join_cond_pushdown_unqual4.q, \ + join_empty.q \ join_filters.q, \ join_filters_overlap.q, \ join_hive_626.q, \ diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/SparkMapJoinResolver.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/SparkMapJoinResolver.java index 670cdd3..4054173 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/SparkMapJoinResolver.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/SparkMapJoinResolver.java @@ -21,7 +21,9 @@ import java.io.Serializable; import java.util.ArrayList; import java.util.Collection; +import java.util.HashSet; import java.util.LinkedHashMap; +import java.util.LinkedHashSet; import java.util.List; import java.util.Map; import java.util.Set; @@ -42,6 +44,7 @@ import org.apache.hadoop.hive.ql.lib.TaskGraphWalker; import org.apache.hadoop.hive.ql.parse.SemanticException; import org.apache.hadoop.hive.ql.plan.BaseWork; +import org.apache.hadoop.hive.ql.plan.MapWork; import org.apache.hadoop.hive.ql.plan.MapredLocalWork; import org.apache.hadoop.hive.ql.plan.OperatorDesc; import org.apache.hadoop.hive.ql.plan.SparkWork; @@ -63,7 +66,24 @@ public PhysicalContext resolve(PhysicalContext pctx) throws SemanticException { // Check whether the specified BaseWork's operator tree contains a operator // of the specified operator class private boolean containsOp(BaseWork work, Class clazz) { - for (Operator op : work.getAllOperators()) { + Set> ops = new HashSet>(); + if (work instanceof MapWork) { + Collection> opSet = ((MapWork) work).getAliasToWork().values(); + Stack> opStack = new Stack>(); + opStack.addAll(opSet); + + while (!opStack.empty()) { + Operator op = opStack.pop(); + ops.add(op); + if (op.getChildOperators() != null) { + opStack.addAll(op.getChildOperators()); + } + } + } else { + ops.addAll(work.getAllOperators()); + } + + for (Operator op : ops) { if (clazz.isInstance(op)) { return true; } diff --git ql/src/test/results/clientpositive/spark/bucketmapjoin1.q.out ql/src/test/results/clientpositive/spark/bucketmapjoin1.q.out index 9f5d625..6b662b2 100644 --- ql/src/test/results/clientpositive/spark/bucketmapjoin1.q.out +++ ql/src/test/results/clientpositive/spark/bucketmapjoin1.q.out @@ -91,18 +91,26 @@ TOK_QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 Stage-0 depends on stages: Stage-1 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-2 Spark - Edges: - Map 2 <- Map 1 (NONE, 0) #### A masked pattern was here #### Vertices: Map 1 + Local Work: + Map Reduce Local Work + + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: Map 2 + Local Work: + Map Reduce Local Work Stage: Stage-0 Fetch Operator @@ -191,18 +199,26 @@ TOK_QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 Stage-0 depends on stages: Stage-1 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-2 Spark - Edges: - Map 2 <- Map 1 (NONE, 0) #### A masked pattern was here #### Vertices: Map 1 + Local Work: + Map Reduce Local Work + + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: Map 2 + Local Work: + Map Reduce Local Work Stage: Stage-0 Fetch Operator diff --git ql/src/test/results/clientpositive/spark/join_view.q.out ql/src/test/results/clientpositive/spark/join_view.q.out index 9915f33..2d94c12 100644 --- ql/src/test/results/clientpositive/spark/join_view.q.out +++ ql/src/test/results/clientpositive/spark/join_view.q.out @@ -43,18 +43,26 @@ PREHOOK: type: QUERY POSTHOOK: query: explain select * from v where ds='2011-09-01' POSTHOOK: type: QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 Stage-0 depends on stages: Stage-1 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-2 Spark - Edges: - Map 2 <- Map 1 (NONE, 0) #### A masked pattern was here #### Vertices: Map 1 + Local Work: + Map Reduce Local Work + + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: Map 2 + Local Work: + Map Reduce Local Work Stage: Stage-0 Fetch Operator diff --git ql/src/test/results/clientpositive/spark/optimize_nullscan.q.out ql/src/test/results/clientpositive/spark/optimize_nullscan.q.out index 1993290..1cd3952 100644 --- ql/src/test/results/clientpositive/spark/optimize_nullscan.q.out +++ ql/src/test/results/clientpositive/spark/optimize_nullscan.q.out @@ -214,88 +214,18 @@ TOK_QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 Stage-0 depends on stages: Stage-1 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-2 Spark Edges: - Reducer 4 <- Map 3 (GROUP, 1) - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Reducer 4 (PARTITION-LEVEL SORT, 1) + Reducer 3 <- Map 2 (GROUP, 1) #### A masked pattern was here #### Vertices: - Map 1 - Map Operator Tree: - TableScan - alias: src - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - GatherStats: false - Filter Operator - isSamplingPred: false - predicate: false (type: boolean) - Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE - Select Operator - expressions: key (type: string) - outputColumnNames: _col0 - Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE - Reduce Output Operator - key expressions: _col0 (type: string) - sort order: + - Map-reduce partition columns: _col0 (type: string) - Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE - tag: 0 - auto parallelism: false - Path -> Alias: - -mr-10003default.src{} [src] - Path -> Partition: - -mr-10003default.src{} - Partition - base file name: src - input format: org.apache.hadoop.hive.ql.io.OneNullRowInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - properties: - COLUMN_STATS_ACCURATE true - bucket_count -1 - columns key,value - columns.comments defaultdefault - columns.types string:string -#### A masked pattern was here #### - name default.src - numFiles 1 - numRows 500 - rawDataSize 5312 - serialization.ddl struct src { string key, string value} - serialization.format 1 - serialization.lib org.apache.hadoop.hive.serde2.NullStructSerDe - totalSize 5812 -#### A masked pattern was here #### - serde: org.apache.hadoop.hive.serde2.NullStructSerDe - - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - properties: - COLUMN_STATS_ACCURATE true - bucket_count -1 - columns key,value - columns.comments defaultdefault - columns.types string:string -#### A masked pattern was here #### - name default.src - numFiles 1 - numRows 500 - rawDataSize 5312 - serialization.ddl struct src { string key, string value} - serialization.format 1 - serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - totalSize 5812 -#### A masked pattern was here #### - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.src - name: default.src - Truncated Path -> Alias: - -mr-10003default.src{} [src] - Map 3 + Map 2 Map Operator Tree: TableScan alias: srcpart @@ -315,12 +245,12 @@ STAGE PLANS: value expressions: _col0 (type: string) auto parallelism: false Path -> Alias: - -mr-10004default.srcpart{ds=2008-04-08, hr=11} [srcpart] - -mr-10005default.srcpart{ds=2008-04-08, hr=12} [srcpart] - -mr-10006default.srcpart{ds=2008-04-09, hr=11} [srcpart] - -mr-10007default.srcpart{ds=2008-04-09, hr=12} [srcpart] + -mr-10005default.srcpart{ds=2008-04-08, hr=11} [srcpart] + -mr-10006default.srcpart{ds=2008-04-08, hr=12} [srcpart] + -mr-10007default.srcpart{ds=2008-04-09, hr=11} [srcpart] + -mr-10008default.srcpart{ds=2008-04-09, hr=12} [srcpart] Path -> Partition: - -mr-10004default.srcpart{ds=2008-04-08, hr=11} + -mr-10005default.srcpart{ds=2008-04-08, hr=11} Partition base file name: hr=11 input format: org.apache.hadoop.hive.ql.io.OneNullRowInputFormat @@ -366,7 +296,7 @@ STAGE PLANS: serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.srcpart name: default.srcpart - -mr-10005default.srcpart{ds=2008-04-08, hr=12} + -mr-10006default.srcpart{ds=2008-04-08, hr=12} Partition base file name: hr=12 input format: org.apache.hadoop.hive.ql.io.OneNullRowInputFormat @@ -412,7 +342,7 @@ STAGE PLANS: serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.srcpart name: default.srcpart - -mr-10006default.srcpart{ds=2008-04-09, hr=11} + -mr-10007default.srcpart{ds=2008-04-09, hr=11} Partition base file name: hr=11 input format: org.apache.hadoop.hive.ql.io.OneNullRowInputFormat @@ -458,7 +388,7 @@ STAGE PLANS: serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.srcpart name: default.srcpart - -mr-10007default.srcpart{ds=2008-04-09, hr=12} + -mr-10008default.srcpart{ds=2008-04-09, hr=12} Partition base file name: hr=12 input format: org.apache.hadoop.hive.ql.io.OneNullRowInputFormat @@ -505,47 +435,13 @@ STAGE PLANS: name: default.srcpart name: default.srcpart Truncated Path -> Alias: - -mr-10004default.srcpart{ds=2008-04-08, hr=11} [srcpart] - -mr-10005default.srcpart{ds=2008-04-08, hr=12} [srcpart] - -mr-10006default.srcpart{ds=2008-04-09, hr=11} [srcpart] - -mr-10007default.srcpart{ds=2008-04-09, hr=12} [srcpart] - Reducer 2 - Needs Tagging: true - Reduce Operator Tree: - Join Operator - condition map: - Left Outer Join0 to 1 - condition expressions: - 0 {KEY.reducesinkkey0} - 1 {KEY.reducesinkkey0} - outputColumnNames: _col0, _col1 - Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE - Select Operator - expressions: _col0 (type: string), _col1 (type: string) - outputColumnNames: _col0, _col1 - Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE - File Output Operator - compressed: false - GlobalTableId: 0 -#### A masked pattern was here #### - NumFilesPerFileSink: 1 - Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE -#### A masked pattern was here #### - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - properties: - columns _col0,_col1 - columns.types string:string - escape.delim \ - hive.serialization.extend.nesting.levels true - serialization.format 1 - serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - TotalFiles: 1 - GatherStats: false - MultiFileSpray: false - Reducer 4 + -mr-10005default.srcpart{ds=2008-04-08, hr=11} [srcpart] + -mr-10006default.srcpart{ds=2008-04-08, hr=12} [srcpart] + -mr-10007default.srcpart{ds=2008-04-09, hr=11} [srcpart] + -mr-10008default.srcpart{ds=2008-04-09, hr=12} [srcpart] + Reducer 3 + Local Work: + Map Reduce Local Work Needs Tagging: false Reduce Operator Tree: Select Operator @@ -555,13 +451,123 @@ STAGE PLANS: Limit Number of rows: 0 Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE - Reduce Output Operator - key expressions: _col0 (type: string) - sort order: + - Map-reduce partition columns: _col0 (type: string) + Spark HashTable Sink Operator + condition expressions: + 0 {_col0} + 1 + keys: + 0 _col0 (type: string) + 1 _col0 (type: string) + Position of Big Table: 0 + + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + GatherStats: false + Filter Operator + isSamplingPred: false + predicate: false (type: boolean) Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE - tag: 1 - auto parallelism: false + Select Operator + expressions: key (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Map Join Operator + condition map: + Left Outer Join0 to 1 + condition expressions: + 0 {_col0} + 1 {_col0} + keys: + 0 _col0 (type: string) + 1 _col0 (type: string) + outputColumnNames: _col0, _col1 + input vertices: + 1 Reducer 3 + Position of Big Table: 0 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1 + columns.types string:string + escape.delim \ + hive.serialization.extend.nesting.levels true + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Local Work: + Map Reduce Local Work + Path -> Alias: + -mr-10004default.src{} [src] + Path -> Partition: + -mr-10004default.src{} + Partition + base file name: src + input format: org.apache.hadoop.hive.ql.io.OneNullRowInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + COLUMN_STATS_ACCURATE true + bucket_count -1 + columns key,value + columns.comments defaultdefault + columns.types string:string +#### A masked pattern was here #### + name default.src + numFiles 1 + numRows 500 + rawDataSize 5312 + serialization.ddl struct src { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.NullStructSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.NullStructSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + COLUMN_STATS_ACCURATE true + bucket_count -1 + columns key,value + columns.comments defaultdefault + columns.types string:string +#### A masked pattern was here #### + name default.src + numFiles 1 + numRows 500 + rawDataSize 5312 + serialization.ddl struct src { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.src + name: default.src + Truncated Path -> Alias: + -mr-10004default.src{} [src] Stage: Stage-0 Fetch Operator @@ -1034,8 +1040,8 @@ POSTHOOK: Input: default@srcpart@ds=2008-04-08/hr=12 POSTHOOK: Input: default@srcpart@ds=2008-04-09/hr=11 POSTHOOK: Input: default@srcpart@ds=2008-04-09/hr=12 #### A masked pattern was here #### -0 2000 +0 PREHOOK: query: explain extended select * from (select key from src where false) a left outer join (select value from srcpart limit 0) b PREHOOK: type: QUERY @@ -1091,87 +1097,18 @@ TOK_QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 Stage-0 depends on stages: Stage-1 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-2 Spark Edges: - Reducer 4 <- Map 3 (GROUP, 1) - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Reducer 4 (PARTITION-LEVEL SORT, 1) + Reducer 3 <- Map 2 (GROUP, 1) #### A masked pattern was here #### Vertices: - Map 1 - Map Operator Tree: - TableScan - alias: src - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - GatherStats: false - Filter Operator - isSamplingPred: false - predicate: false (type: boolean) - Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE - Select Operator - expressions: key (type: string) - outputColumnNames: _col0 - Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE - Reduce Output Operator - sort order: - Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE - tag: 0 - value expressions: _col0 (type: string) - auto parallelism: false - Path -> Alias: - -mr-10003default.src{} [src] - Path -> Partition: - -mr-10003default.src{} - Partition - base file name: src - input format: org.apache.hadoop.hive.ql.io.OneNullRowInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - properties: - COLUMN_STATS_ACCURATE true - bucket_count -1 - columns key,value - columns.comments defaultdefault - columns.types string:string -#### A masked pattern was here #### - name default.src - numFiles 1 - numRows 500 - rawDataSize 5312 - serialization.ddl struct src { string key, string value} - serialization.format 1 - serialization.lib org.apache.hadoop.hive.serde2.NullStructSerDe - totalSize 5812 -#### A masked pattern was here #### - serde: org.apache.hadoop.hive.serde2.NullStructSerDe - - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - properties: - COLUMN_STATS_ACCURATE true - bucket_count -1 - columns key,value - columns.comments defaultdefault - columns.types string:string -#### A masked pattern was here #### - name default.src - numFiles 1 - numRows 500 - rawDataSize 5312 - serialization.ddl struct src { string key, string value} - serialization.format 1 - serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - totalSize 5812 -#### A masked pattern was here #### - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.src - name: default.src - Truncated Path -> Alias: - -mr-10003default.src{} [src] - Map 3 + Map 2 Map Operator Tree: TableScan alias: srcpart @@ -1191,12 +1128,12 @@ STAGE PLANS: value expressions: _col0 (type: string) auto parallelism: false Path -> Alias: - -mr-10004default.srcpart{ds=2008-04-08, hr=11} [srcpart] - -mr-10005default.srcpart{ds=2008-04-08, hr=12} [srcpart] - -mr-10006default.srcpart{ds=2008-04-09, hr=11} [srcpart] - -mr-10007default.srcpart{ds=2008-04-09, hr=12} [srcpart] + -mr-10005default.srcpart{ds=2008-04-08, hr=11} [srcpart] + -mr-10006default.srcpart{ds=2008-04-08, hr=12} [srcpart] + -mr-10007default.srcpart{ds=2008-04-09, hr=11} [srcpart] + -mr-10008default.srcpart{ds=2008-04-09, hr=12} [srcpart] Path -> Partition: - -mr-10004default.srcpart{ds=2008-04-08, hr=11} + -mr-10005default.srcpart{ds=2008-04-08, hr=11} Partition base file name: hr=11 input format: org.apache.hadoop.hive.ql.io.OneNullRowInputFormat @@ -1242,7 +1179,7 @@ STAGE PLANS: serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.srcpart name: default.srcpart - -mr-10005default.srcpart{ds=2008-04-08, hr=12} + -mr-10006default.srcpart{ds=2008-04-08, hr=12} Partition base file name: hr=12 input format: org.apache.hadoop.hive.ql.io.OneNullRowInputFormat @@ -1288,7 +1225,7 @@ STAGE PLANS: serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.srcpart name: default.srcpart - -mr-10006default.srcpart{ds=2008-04-09, hr=11} + -mr-10007default.srcpart{ds=2008-04-09, hr=11} Partition base file name: hr=11 input format: org.apache.hadoop.hive.ql.io.OneNullRowInputFormat @@ -1334,7 +1271,7 @@ STAGE PLANS: serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.srcpart name: default.srcpart - -mr-10007default.srcpart{ds=2008-04-09, hr=12} + -mr-10008default.srcpart{ds=2008-04-09, hr=12} Partition base file name: hr=12 input format: org.apache.hadoop.hive.ql.io.OneNullRowInputFormat @@ -1381,47 +1318,13 @@ STAGE PLANS: name: default.srcpart name: default.srcpart Truncated Path -> Alias: - -mr-10004default.srcpart{ds=2008-04-08, hr=11} [srcpart] - -mr-10005default.srcpart{ds=2008-04-08, hr=12} [srcpart] - -mr-10006default.srcpart{ds=2008-04-09, hr=11} [srcpart] - -mr-10007default.srcpart{ds=2008-04-09, hr=12} [srcpart] - Reducer 2 - Needs Tagging: true - Reduce Operator Tree: - Join Operator - condition map: - Left Outer Join0 to 1 - condition expressions: - 0 {VALUE._col0} - 1 {VALUE._col0} - outputColumnNames: _col0, _col1 - Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE - Select Operator - expressions: _col0 (type: string), _col1 (type: string) - outputColumnNames: _col0, _col1 - Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE - File Output Operator - compressed: false - GlobalTableId: 0 -#### A masked pattern was here #### - NumFilesPerFileSink: 1 - Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE -#### A masked pattern was here #### - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - properties: - columns _col0,_col1 - columns.types string:string - escape.delim \ - hive.serialization.extend.nesting.levels true - serialization.format 1 - serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - TotalFiles: 1 - GatherStats: false - MultiFileSpray: false - Reducer 4 + -mr-10005default.srcpart{ds=2008-04-08, hr=11} [srcpart] + -mr-10006default.srcpart{ds=2008-04-08, hr=12} [srcpart] + -mr-10007default.srcpart{ds=2008-04-09, hr=11} [srcpart] + -mr-10008default.srcpart{ds=2008-04-09, hr=12} [srcpart] + Reducer 3 + Local Work: + Map Reduce Local Work Needs Tagging: false Reduce Operator Tree: Select Operator @@ -1431,12 +1334,123 @@ STAGE PLANS: Limit Number of rows: 0 Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE - Reduce Output Operator - sort order: + Spark HashTable Sink Operator + condition expressions: + 0 {_col0} + 1 {_col0} + keys: + 0 + 1 + Position of Big Table: 0 + + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + GatherStats: false + Filter Operator + isSamplingPred: false + predicate: false (type: boolean) Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE - tag: 1 - value expressions: _col0 (type: string) - auto parallelism: false + Select Operator + expressions: key (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Map Join Operator + condition map: + Left Outer Join0 to 1 + condition expressions: + 0 {_col0} + 1 {_col0} + keys: + 0 + 1 + outputColumnNames: _col0, _col1 + input vertices: + 1 Reducer 3 + Position of Big Table: 0 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1 + columns.types string:string + escape.delim \ + hive.serialization.extend.nesting.levels true + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Local Work: + Map Reduce Local Work + Path -> Alias: + -mr-10004default.src{} [src] + Path -> Partition: + -mr-10004default.src{} + Partition + base file name: src + input format: org.apache.hadoop.hive.ql.io.OneNullRowInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + COLUMN_STATS_ACCURATE true + bucket_count -1 + columns key,value + columns.comments defaultdefault + columns.types string:string +#### A masked pattern was here #### + name default.src + numFiles 1 + numRows 500 + rawDataSize 5312 + serialization.ddl struct src { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.NullStructSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.NullStructSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + COLUMN_STATS_ACCURATE true + bucket_count -1 + columns key,value + columns.comments defaultdefault + columns.types string:string +#### A masked pattern was here #### + name default.src + numFiles 1 + numRows 500 + rawDataSize 5312 + serialization.ddl struct src { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.src + name: default.src + Truncated Path -> Alias: + -mr-10004default.src{} [src] Stage: Stage-0 Fetch Operator @@ -1528,19 +1542,26 @@ TOK_QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 Stage-0 depends on stages: Stage-1 STAGE PLANS: + Stage: Stage-2 + Spark +#### A masked pattern was here #### + Vertices: + Map 1 + Local Work: + Map Reduce Local Work + Stage: Stage-1 Spark Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 5 (PARTITION-LEVEL SORT, 1) - Union 3 <- Map 4 (NONE, 0), Reducer 2 (NONE, 0) + Union 3 <- Map 2 (NONE, 0), Map 4 (NONE, 0) #### A masked pattern was here #### Vertices: - Map 1 - Map 4 + Map 2 Map Operator Tree: TableScan alias: src @@ -1623,30 +1644,62 @@ STAGE PLANS: name: default.src Truncated Path -> Alias: /src [src] - Map 5 + Map 4 Map Operator Tree: TableScan alias: src - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: false (type: boolean) - Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE - Reduce Output Operator - key expressions: key (type: string) - sort order: + - Map-reduce partition columns: key (type: string) - Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE - tag: 0 - auto parallelism: false + Map Join Operator + condition map: + Left Outer Join0 to 1 + condition expressions: + 0 {key} + 1 + keys: + 0 key (type: string) + 1 key (type: string) + outputColumnNames: _col0 + input vertices: + 1 Map 1 + Position of Big Table: 0 + Select Operator + expressions: _col0 (type: string) + outputColumnNames: _col0 + Select Operator + expressions: _col0 (type: string) + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0 + columns.types string + escape.delim \ + hive.serialization.extend.nesting.levels true + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Local Work: + Map Reduce Local Work Path -> Alias: - -mr-10003default.src{} [src] +#### A masked pattern was here #### Path -> Partition: - -mr-10003default.src{} +#### A masked pattern was here #### Partition base file name: src - input format: org.apache.hadoop.hive.ql.io.OneNullRowInputFormat + input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: COLUMN_STATS_ACCURATE true @@ -1661,10 +1714,10 @@ STAGE PLANS: rawDataSize 5312 serialization.ddl struct src { string key, string value} serialization.format 1 - serialization.lib org.apache.hadoop.hive.serde2.NullStructSerDe + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 5812 #### A masked pattern was here #### - serde: org.apache.hadoop.hive.serde2.NullStructSerDe + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat @@ -1688,43 +1741,7 @@ STAGE PLANS: name: default.src name: default.src Truncated Path -> Alias: - -mr-10003default.src{} [src] - Reducer 2 - Needs Tagging: true - Reduce Operator Tree: - Join Operator - condition map: - Left Outer Join0 to 1 - condition expressions: - 0 {KEY.reducesinkkey0} - 1 - outputColumnNames: _col0 - Select Operator - expressions: _col0 (type: string) - outputColumnNames: _col0 - Select Operator - expressions: _col0 (type: string) - outputColumnNames: _col0 - File Output Operator - compressed: false - GlobalTableId: 0 -#### A masked pattern was here #### - NumFilesPerFileSink: 1 -#### A masked pattern was here #### - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - properties: - columns _col0 - columns.types string - escape.delim \ - hive.serialization.extend.nesting.levels true - serialization.format 1 - serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - TotalFiles: 1 - GatherStats: false - MultiFileSpray: false + /src [src] Union 3 Vertex: Union 3 @@ -1785,14 +1802,13 @@ TOK_QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 Stage-0 depends on stages: Stage-1 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-2 Spark - Edges: - Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 3 (PARTITION-LEVEL SORT, 1) #### A masked pattern was here #### Vertices: Map 1 @@ -1805,18 +1821,20 @@ STAGE PLANS: isSamplingPred: false predicate: false (type: boolean) Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE - Reduce Output Operator - key expressions: value (type: string) - sort order: + - Map-reduce partition columns: value (type: string) - Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE - tag: 1 - value expressions: key (type: string) - auto parallelism: false + Spark HashTable Sink Operator + condition expressions: + 0 {key} {value} + 1 {key} + keys: + 0 value (type: string) + 1 value (type: string) + Position of Big Table: 0 + Local Work: + Map Reduce Local Work Path -> Alias: - -mr-10003default.src{} [s2] + -mr-10005default.src{} [s2] Path -> Partition: - -mr-10003default.src{} + -mr-10005default.src{} Partition base file name: src input format: org.apache.hadoop.hive.ql.io.OneNullRowInputFormat @@ -1861,8 +1879,13 @@ STAGE PLANS: name: default.src name: default.src Truncated Path -> Alias: - -mr-10003default.src{} [s2] - Map 3 + -mr-10005default.src{} [s2] + + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: + Map 2 Map Operator Tree: TableScan alias: s1 @@ -1872,14 +1895,47 @@ STAGE PLANS: isSamplingPred: false predicate: false (type: boolean) Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE - Reduce Output Operator - key expressions: value (type: string) - sort order: + - Map-reduce partition columns: value (type: string) + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} {value} + 1 {key} {value} + keys: + 0 value (type: string) + 1 value (type: string) + outputColumnNames: _col0, _col1, _col5, _col6 + input vertices: + 1 Map 1 + Position of Big Table: 0 Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE - tag: 0 - value expressions: key (type: string) - auto parallelism: false + Select Operator + expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2,_col3 + columns.types string:string:string:string + escape.delim \ + hive.serialization.extend.nesting.levels true + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Local Work: + Map Reduce Local Work Path -> Alias: -mr-10004default.src{} [s1] Path -> Partition: @@ -1929,42 +1985,6 @@ STAGE PLANS: name: default.src Truncated Path -> Alias: -mr-10004default.src{} [s1] - Reducer 2 - Needs Tagging: true - Reduce Operator Tree: - Join Operator - condition map: - Inner Join 0 to 1 - condition expressions: - 0 {VALUE._col0} {KEY.reducesinkkey0} - 1 {VALUE._col0} {KEY.reducesinkkey0} - outputColumnNames: _col0, _col1, _col5, _col6 - Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE - Select Operator - expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string) - outputColumnNames: _col0, _col1, _col2, _col3 - Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE - File Output Operator - compressed: false - GlobalTableId: 0 -#### A masked pattern was here #### - NumFilesPerFileSink: 1 - Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE -#### A masked pattern was here #### - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - properties: - columns _col0,_col1,_col2,_col3 - columns.types string:string:string:string - escape.delim \ - hive.serialization.extend.nesting.levels true - serialization.format 1 - serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - TotalFiles: 1 - GatherStats: false - MultiFileSpray: false Stage: Stage-0 Fetch Operator diff --git ql/src/test/results/clientpositive/spark/smb_mapjoin9.q.out ql/src/test/results/clientpositive/spark/smb_mapjoin9.q.out index e1614d9..a856ec9 100644 --- ql/src/test/results/clientpositive/spark/smb_mapjoin9.q.out +++ ql/src/test/results/clientpositive/spark/smb_mapjoin9.q.out @@ -106,18 +106,26 @@ TOK_QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 Stage-0 depends on stages: Stage-1 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-2 Spark - Edges: - Map 2 <- Map 1 (NONE, 0) #### A masked pattern was here #### Vertices: Map 1 + Local Work: + Map Reduce Local Work + + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: Map 2 + Local Work: + Map Reduce Local Work Stage: Stage-0 Fetch Operator @@ -231,18 +239,26 @@ TOK_QUERY STAGE DEPENDENCIES: - Stage-1 is a root stage + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 Stage-0 depends on stages: Stage-1 STAGE PLANS: - Stage: Stage-1 + Stage: Stage-2 Spark - Edges: - Map 2 <- Map 1 (NONE, 0) #### A masked pattern was here #### Vertices: Map 1 + Local Work: + Map Reduce Local Work + + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: Map 2 + Local Work: + Map Reduce Local Work Stage: Stage-0 Fetch Operator