diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/metainfo/annotation/OpTraitsRulesProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/metainfo/annotation/OpTraitsRulesProcFactory.java
index 89db530f54..8f751260a0 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/metainfo/annotation/OpTraitsRulesProcFactory.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/metainfo/annotation/OpTraitsRulesProcFactory.java
@@ -110,6 +110,19 @@ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
       for (List<String> cols : parentOpTraits.getBucketColNames()) {
         for (String col : cols) {
           for (Entry<String, ExprNodeDesc> entry : rs.getColumnExprMap().entrySet()) {
+            // Make sure this entry is in key columns.
+            boolean isKey = false;
+            for (ExprNodeDesc keyDesc : rs.getConf().getKeyCols()) {
+              if (keyDesc.isSame(entry.getValue())) {
+                isKey = true;
+                break;
+              }
+            }
+
+            // skip if not a key
+            if (!isKey) {
+              continue;
+            }
             // Fetch the column expression. There should be atleast one.
             Map colMap = new HashMap<>();
             boolean found = false;
diff --git a/ql/src/test/queries/clientpositive/bucket_map_join_tez2.q b/ql/src/test/queries/clientpositive/bucket_map_join_tez2.q
index adcf6962ab..ae1ec441ee 100644
--- a/ql/src/test/queries/clientpositive/bucket_map_join_tez2.q
+++ b/ql/src/test/queries/clientpositive/bucket_map_join_tez2.q
@@ -138,3 +138,22 @@ explain select a.key, b.key from tab_part_n11 a join tab_part_n11 c on a.key = c
 set test.comment=External tables, bucket map join should be disabled;
 set test.comment;
 explain select a.key, b.key from tab_part_ext a join tab_part_ext c on a.key = c.key join tab_part_ext b on a.value = b.value;
+
+-- HIVE-20187 : Must not create BMJ
+create table my_fact(AMT decimal(20,3),bucket_col string ,join_col string )
+PARTITIONED BY (FISCAL_YEAR string ,ACCOUNTING_PERIOD string )
+CLUSTERED BY (bucket_col) INTO 10
+BUCKETS
+stored as ORC
+;
+create table my_dim(join_col string,filter_col string) stored as orc;
+
+INSERT INTO my_dim VALUES("1", "VAL1"), ("2", "VAL2"), ("3", "VAL3"), ("4", "VAL4");
+INSERT OVERWRITE TABLE my_fact PARTITION(FISCAL_YEAR="2015", ACCOUNTING_PERIOD="20") VALUES(1.11, "20", "1"), (1.11, "20", "1"), (1.12, "20", "2"), (1.12, "20", "3"), (1.12, "11", "3"), (1.12, "9", "3");
+
+explain extended
+select bucket_col, my_dim.join_col as account1,my_fact.accounting_period
+FROM my_fact JOIN my_dim ON my_fact.join_col = my_dim.join_col
+WHERE my_fact.fiscal_year = '2015'
+AND my_dim.filter_col IN ( 'VAL1', 'VAL2' )
+and my_fact.accounting_period in (10);
\ No newline at end of file
diff --git a/ql/src/test/results/clientpositive/llap/bucket_map_join_tez2.q.out b/ql/src/test/results/clientpositive/llap/bucket_map_join_tez2.q.out
index 4f042cee50..bf64a15072 100644
--- a/ql/src/test/results/clientpositive/llap/bucket_map_join_tez2.q.out
+++ b/ql/src/test/results/clientpositive/llap/bucket_map_join_tez2.q.out
@@ -2180,3 +2180,215 @@ STAGE PLANS:
       Processor Tree:
         ListSink
 
+PREHOOK: query: create table my_fact(AMT decimal(20,3),bucket_col string ,join_col string )
+PARTITIONED BY (FISCAL_YEAR string ,ACCOUNTING_PERIOD string )
+CLUSTERED BY (bucket_col) INTO 10
+BUCKETS
+stored as ORC
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@my_fact
+POSTHOOK: query: create table my_fact(AMT decimal(20,3),bucket_col string ,join_col string )
+PARTITIONED BY (FISCAL_YEAR string ,ACCOUNTING_PERIOD string )
+CLUSTERED BY (bucket_col) INTO 10
+BUCKETS
+stored as ORC
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@my_fact
+PREHOOK: query: create table my_dim(join_col string,filter_col string) stored as orc
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@my_dim
+POSTHOOK: query: create table my_dim(join_col string,filter_col string) stored as orc
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@my_dim
+PREHOOK: query: INSERT INTO my_dim VALUES("1", "VAL1"), ("2", "VAL2"), ("3", "VAL3"), ("4", "VAL4")
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@my_dim
+POSTHOOK: query: INSERT INTO my_dim VALUES("1", "VAL1"), ("2", "VAL2"), ("3", "VAL3"), ("4", "VAL4")
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@my_dim
+POSTHOOK: Lineage: my_dim.filter_col SCRIPT []
+POSTHOOK: Lineage: my_dim.join_col SCRIPT []
+PREHOOK: query: INSERT OVERWRITE TABLE my_fact PARTITION(FISCAL_YEAR="2015", ACCOUNTING_PERIOD="20") VALUES(1.11, "20", "1"), (1.11, "20", "1"), (1.12, "20", "2"), (1.12, "20", "3"), (1.12, "11", "3"), (1.12, "9", "3")
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@my_fact@fiscal_year=2015/accounting_period=20
+POSTHOOK: query: INSERT OVERWRITE TABLE my_fact PARTITION(FISCAL_YEAR="2015", ACCOUNTING_PERIOD="20") VALUES(1.11, "20", "1"), (1.11, "20", "1"), (1.12, "20", "2"), (1.12, "20", "3"), (1.12, "11", "3"), (1.12, "9", "3")
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@my_fact@fiscal_year=2015/accounting_period=20
+POSTHOOK: Lineage: my_fact PARTITION(fiscal_year=2015,accounting_period=20).amt SCRIPT []
+POSTHOOK: Lineage: my_fact PARTITION(fiscal_year=2015,accounting_period=20).bucket_col SCRIPT []
+POSTHOOK: Lineage: my_fact PARTITION(fiscal_year=2015,accounting_period=20).join_col SCRIPT []
+PREHOOK: query: explain extended
+select bucket_col, my_dim.join_col as account1,my_fact.accounting_period
+FROM my_fact JOIN my_dim ON my_fact.join_col = my_dim.join_col
+WHERE my_fact.fiscal_year = '2015'
+AND my_dim.filter_col IN ( 'VAL1', 'VAL2' )
+and my_fact.accounting_period in (10)
+PREHOOK: type: QUERY
+POSTHOOK: query: explain extended
+select bucket_col, my_dim.join_col as account1,my_fact.accounting_period
+FROM my_fact JOIN my_dim ON my_fact.join_col = my_dim.join_col
+WHERE my_fact.fiscal_year = '2015'
+AND my_dim.filter_col IN ( 'VAL1', 'VAL2' )
+and my_fact.accounting_period in (10)
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Edges:
+        Map 2 <- Map 1 (BROADCAST_EDGE)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: my_fact
+                  filterExpr: ((fiscal_year = '2015') and (UDFToDouble(accounting_period) = 10.0D) and join_col is not null) (type: boolean)
+                  Statistics: Num rows: 1 Data size: 736 Basic stats: COMPLETE Column stats: PARTIAL
+                  GatherStats: false
+                  Filter Operator
+                    isSamplingPred: false
+                    predicate: ((UDFToDouble(accounting_period) = 10.0D) and (fiscal_year = '2015') and join_col is not null) (type: boolean)
+                    Statistics: Num rows: 1 Data size: 736 Basic stats: COMPLETE Column stats: PARTIAL
+                    Select Operator
+                      expressions: bucket_col (type: string), join_col (type: string), accounting_period (type: string)
+                      outputColumnNames: _col0, _col1, _col3
+                      Statistics: Num rows: 1 Data size: 640 Basic stats: COMPLETE Column stats: PARTIAL
+                      Reduce Output Operator
+                        key expressions: _col1 (type: string)
+                        null sort order: a
+                        sort order: +
+                        Map-reduce partition columns: _col1 (type: string)
+                        Statistics: Num rows: 1 Data size: 640 Basic stats: COMPLETE Column stats: PARTIAL
+                        tag: 0
+                        value expressions: _col0 (type: string), _col3 (type: string)
+                        auto parallelism: true
+            Execution mode: vectorized, llap
+            LLAP IO: unknown
+        Map 2 
+            Map Operator Tree:
+                TableScan
+                  alias: my_dim
+                  filterExpr: ((filter_col) IN ('VAL1', 'VAL2') and join_col is not null) (type: boolean)
+                  Statistics: Num rows: 4 Data size: 1472 Basic stats: COMPLETE Column stats: NONE
+                  GatherStats: false
+                  Filter Operator
+                    isSamplingPred: false
+                    predicate: ((filter_col) IN ('VAL1', 'VAL2') and join_col is not null) (type: boolean)
+                    Statistics: Num rows: 4 Data size: 1472 Basic stats: COMPLETE Column stats: NONE
+                    Select Operator
+                      expressions: join_col (type: string)
+                      outputColumnNames: _col0
+                      Statistics: Num rows: 4 Data size: 1472 Basic stats: COMPLETE Column stats: NONE
+                      Map Join Operator
+                        condition map:
+                             Inner Join 0 to 1
+                        Estimated key counts: Map 1 => 1
+                        keys:
+                          0 _col1 (type: string)
+                          1 _col0 (type: string)
+                        outputColumnNames: _col0, _col3, _col4
+                        input vertices:
+                          0 Map 1
+                        Position of Big Table: 1
+                        Statistics: Num rows: 4 Data size: 1619 Basic stats: COMPLETE Column stats: NONE
+                        Select Operator
+                          expressions: _col0 (type: string), _col4 (type: string), _col3 (type: string)
+                          outputColumnNames: _col0, _col1, _col2
+                          Statistics: Num rows: 4 Data size: 1619 Basic stats: COMPLETE Column stats: NONE
+                          File Output Operator
+                            compressed: false
+                            GlobalTableId: 0
+#### A masked pattern was here ####
+                            NumFilesPerFileSink: 1
+                            Statistics: Num rows: 4 Data size: 1619 Basic stats: COMPLETE Column stats: NONE
+#### A masked pattern was here ####
+                            table:
+                                input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                                output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                                properties:
+                                  columns _col0,_col1,_col2
+                                  columns.types string:string:string
+                                  escape.delim \
+                                  hive.serialization.extend.additional.nesting.levels true
+                                  serialization.escape.crlf true
+                                  serialization.format 1
+                                  serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                                serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                            TotalFiles: 1
+                            GatherStats: false
+                            MultiFileSpray: false
+            Execution mode: vectorized, llap
+            LLAP IO: all inputs
+            Path -> Alias:
+#### A masked pattern was here ####
+            Path -> Partition:
+#### A masked pattern was here ####
+                Partition
+                  base file name: my_dim
+                  input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat
+                  properties:
+                    COLUMN_STATS_ACCURATE {"BASIC_STATS":"true"}
+                    bucket_count -1
+                    bucketing_version 2
+                    column.name.delimiter ,
+                    columns join_col,filter_col
+                    columns.comments 
+                    columns.types string:string
+#### A masked pattern was here ####
+                    name default.my_dim
+                    numFiles 1
+                    numRows 4
+                    rawDataSize 692
+                    serialization.ddl struct my_dim { string join_col, string filter_col}
+                    serialization.format 1
+                    serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde
+                    totalSize 338
+#### A masked pattern was here ####
+                  serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde
+
+                    input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+                    output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat
+                    properties:
+                      COLUMN_STATS_ACCURATE {"BASIC_STATS":"true"}
+                      bucket_count -1
+                      bucketing_version 2
+                      column.name.delimiter ,
+                      columns join_col,filter_col
+                      columns.comments 
+                      columns.types string:string
+#### A masked pattern was here ####
+                      name default.my_dim
+                      numFiles 1
+                      numRows 4
+                      rawDataSize 692
+                      serialization.ddl struct my_dim { string join_col, string filter_col}
+                      serialization.format 1
+                      serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde
+                      totalSize 338
+#### A masked pattern was here ####
+                    serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde
+                    name: default.my_dim
+                  name: default.my_dim
+            Truncated Path -> Alias:
+              /my_dim [my_dim]
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
diff --git a/ql/src/test/results/clientpositive/llap/tez_smb_main.q.out b/ql/src/test/results/clientpositive/llap/tez_smb_main.q.out
index 9929989f0e..1809fa77d1 100644
--- a/ql/src/test/results/clientpositive/llap/tez_smb_main.q.out
+++ b/ql/src/test/results/clientpositive/llap/tez_smb_main.q.out
@@ -592,7 +592,7 @@ STAGE PLANS:
     Tez
 #### A masked pattern was here ####
       Edges:
-        Map 1 <- Map 4 (CUSTOM_EDGE)
+        Map 1 <- Map 4 (BROADCAST_EDGE)
         Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 5 (SIMPLE_EDGE)
         Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE)
 #### A masked pattern was here ####
@@ -721,7 +721,7 @@ STAGE PLANS:
     Tez
 #### A masked pattern was here ####
       Edges:
-        Map 1 <- Map 4 (CUSTOM_EDGE)
+        Map 1 <- Map 4 (BROADCAST_EDGE)
         Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 5 (SIMPLE_EDGE)
         Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE)
 #### A masked pattern was here ####
@@ -853,7 +853,7 @@ POSTHOOK: Input: default@tab_n11@ds=2008-04-08
 POSTHOOK: Input: default@tab_part_n12
 POSTHOOK: Input: default@tab_part_n12@ds=2008-04-08
 #### A masked pattern was here ####
-9
+40
 PREHOOK: query: explain select count(*) from tab_n11 a join tab_part_n12 b on a.value = b.value
 PREHOOK: type: QUERY
 POSTHOOK: query: explain select count(*) from tab_n11 a join tab_part_n12 b on a.value = b.value
@@ -1426,7 +1426,7 @@ STAGE PLANS:
     Tez
 #### A masked pattern was here ####
       Edges:
-        Map 1 <- Map 2 (CUSTOM_EDGE)
+        Map 1 <- Map 2 (BROADCAST_EDGE)
         Map 3 <- Map 1 (CUSTOM_EDGE)
         Reducer 4 <- Map 3 (CUSTOM_SIMPLE_EDGE)
 #### A masked pattern was here ####
@@ -1549,7 +1549,7 @@ STAGE PLANS:
     Tez
 #### A masked pattern was here ####
       Edges:
-        Map 1 <- Map 2 (CUSTOM_EDGE)
+        Map 1 <- Map 2 (BROADCAST_EDGE)
         Map 3 <- Map 1 (CUSTOM_EDGE)
         Reducer 4 <- Map 3 (CUSTOM_SIMPLE_EDGE)
 #### A masked pattern was here ####
@@ -1675,7 +1675,7 @@ POSTHOOK: Input: default@tab_n11@ds=2008-04-08
 POSTHOOK: Input: default@tab_part_n12
 POSTHOOK: Input: default@tab_part_n12@ds=2008-04-08
 #### A masked pattern was here ####
-9
+40
 PREHOOK: query: explain select count(*) from (select s1.key as key, s1.value as value from tab_n11 s1 join tab_n11 s3 on s1.key=s3.key
 UNION ALL
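
Reviewer note: below is a minimal standalone sketch of the guard this patch adds in OpTraitsRulesProcFactory: a parent's bucket column may only survive a ReduceSink if the expression it maps to in the ReduceSink's columnExprMap is also one of the ReduceSink's key columns; otherwise the bucket trait must be dropped so a bucket map join is not planned on a column the shuffle did not partition by. The Expr record and isKeyColumn helper are hypothetical simplifications for illustration only; the real code compares ExprNodeDesc objects from rs.getColumnExprMap() against rs.getConf().getKeyCols() via isSame().

import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

public class KeyColumnGuardSketch {

  // Hypothetical stand-in for Hive's ExprNodeDesc: identity by column name,
  // with an isSame() mirroring ExprNodeDesc.isSame() in the patch.
  record Expr(String column) {
    boolean isSame(Expr other) {
      return other != null && column.equals(other.column);
    }
  }

  // The guard itself: true only when the mapped expression is one of the
  // ReduceSink's key (shuffle) columns.
  static boolean isKeyColumn(Expr mapped, List<Expr> keyCols) {
    for (Expr keyDesc : keyCols) {
      if (keyDesc.isSame(mapped)) {
        return true;
      }
    }
    return false;
  }

  public static void main(String[] args) {
    // columnExprMap: output column -> expression it was computed from,
    // modeled on the my_fact side of the test query above.
    Map<String, Expr> columnExprMap = new LinkedHashMap<>();
    columnExprMap.put("_col0", new Expr("bucket_col")); // carried as a value
    columnExprMap.put("_col1", new Expr("join_col"));   // the shuffle key

    // The ReduceSink shuffles on join_col only, not on bucket_col.
    List<Expr> keyCols = Arrays.asList(new Expr("join_col"));

    for (Map.Entry<String, Expr> entry : columnExprMap.entrySet()) {
      // Before the fix, bucket_col's trait leaked through this ReduceSink
      // and a bucket map join (CUSTOM_EDGE) was wrongly chosen; with the
      // guard, only _col1 passes and the join falls back to broadcast.
      System.out.println(entry.getKey() + " is key column? "
          + isKeyColumn(entry.getValue(), keyCols));
    }
  }
}

Running the sketch prints false for _col0 and true for _col1, which is the behavior change reflected in the golden files above: the CUSTOM_EDGE plans become BROADCAST_EDGE and the previously wrong count of 9 becomes 40.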