diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConvertJoinMapJoin.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConvertJoinMapJoin.java index 6bef5f5..69f3c5b 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConvertJoinMapJoin.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConvertJoinMapJoin.java @@ -483,6 +483,11 @@ private boolean checkColEquality(List> grandParentColNames, int colCount = 0; // parent op is guaranteed to have a single list because it is a reduce sink for (String colName : parentColNames.get(0)) { + if (listBucketCols.size() <= colCount) { + // can happen with virtual columns. RS would add the column to its output columns + // but it would not exist in the grandparent output columns or exprMap. + return false; + } // all columns need to be at least a subset of the parentOfParent's bucket cols ExprNodeDesc exprNodeDesc = colExprMap.get(colName); if (exprNodeDesc instanceof ExprNodeColumnDesc) { diff --git ql/src/test/queries/clientpositive/bucket_map_join_tez1.q ql/src/test/queries/clientpositive/bucket_map_join_tez1.q index c9266a5..42e26a8 100644 --- ql/src/test/queries/clientpositive/bucket_map_join_tez1.q +++ ql/src/test/queries/clientpositive/bucket_map_join_tez1.q @@ -82,4 +82,6 @@ from tab1 a join tab_part b on a.key = b.key; explain select a.key, b.key from tab_part a join tab_part c on a.key = c.key join tab_part b on a.value = b.value; - +explain +select a.key, a.value, b.value +from tab a join tab_part b on a.key = b.key and a.ds = b.ds; diff --git ql/src/test/results/clientpositive/spark/bucket_map_join_tez1.q.out ql/src/test/results/clientpositive/spark/bucket_map_join_tez1.q.out index 1c81d1b..65bded2 100644 --- ql/src/test/results/clientpositive/spark/bucket_map_join_tez1.q.out +++ ql/src/test/results/clientpositive/spark/bucket_map_join_tez1.q.out @@ -1218,3 +1218,78 @@ STAGE PLANS: Processor Tree: ListSink +PREHOOK: query: explain +select a.key, a.value, b.value +from tab a join tab_part b on a.key = b.key and a.ds = b.ds +PREHOOK: type: QUERY +POSTHOOK: query: explain +select a.key, a.value, b.value +from tab a join tab_part b on a.key = b.key and a.ds = b.ds +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-2 + Spark +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: a + Statistics: Num rows: 242 Data size: 2566 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: key is not null (type: boolean) + Statistics: Num rows: 121 Data size: 1283 Basic stats: COMPLETE Column stats: NONE + Spark HashTable Sink Operator + keys: + 0 key (type: int), ds (type: string) + 1 key (type: int), ds (type: string) + Local Work: + Map Reduce Local Work + + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: + Map 2 + Map Operator Tree: + TableScan + alias: b + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: key is not null (type: boolean) + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 key (type: int), ds (type: string) + 1 key (type: int), ds (type: string) + outputColumnNames: _col0, _col1, _col7 + input vertices: + 0 Map 1 + Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: int), _col1 (type: string), _col7 (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Local Work: + Map Reduce Local Work + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + diff --git ql/src/test/results/clientpositive/tez/bucket_map_join_tez1.q.out ql/src/test/results/clientpositive/tez/bucket_map_join_tez1.q.out index 5566fd3..8ff9f7c 100644 --- ql/src/test/results/clientpositive/tez/bucket_map_join_tez1.q.out +++ ql/src/test/results/clientpositive/tez/bucket_map_join_tez1.q.out @@ -1127,3 +1127,87 @@ STAGE PLANS: Processor Tree: ListSink +PREHOOK: query: explain +select a.key, a.value, b.value +from tab a join tab_part b on a.key = b.key and a.ds = b.ds +PREHOOK: type: QUERY +POSTHOOK: query: explain +select a.key, a.value, b.value +from tab a join tab_part b on a.key = b.key and a.ds = b.ds +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez + Edges: + Map 2 <- Map 1 (BROADCAST_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: a + Statistics: Num rows: 242 Data size: 2566 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: key is not null (type: boolean) + Statistics: Num rows: 121 Data size: 1283 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: key (type: int), ds (type: string) + sort order: ++ + Map-reduce partition columns: key (type: int), ds (type: string) + Statistics: Num rows: 121 Data size: 1283 Basic stats: COMPLETE Column stats: NONE + value expressions: value (type: string) + Select Operator + expressions: ds (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 121 Data size: 1283 Basic stats: COMPLETE Column stats: NONE + Group By Operator + keys: _col0 (type: string) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 121 Data size: 1283 Basic stats: COMPLETE Column stats: NONE + Dynamic Partitioning Event Operator + Target Input: b + Partition key expr: ds + Statistics: Num rows: 121 Data size: 1283 Basic stats: COMPLETE Column stats: NONE + Target column: ds + Target Vertex: Map 2 + Map 2 + Map Operator Tree: + TableScan + alias: b + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: key is not null (type: boolean) + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 key (type: int), ds (type: string) + 1 key (type: int), ds (type: string) + outputColumnNames: _col0, _col1, _col7 + input vertices: + 0 Map 1 + Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: int), _col1 (type: string), _col7 (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink +