Details
-
Bug
-
Status: Closed
-
Major
-
Resolution: Fixed
-
None
-
None
Description
select query result is different when enable/disable mapjoin with UNION ALL
Below are the reproduce steps.
As per query when map.join is disabled it should not give rows(duplicate). Same is working fine with map.join=true.
Expected result: Empty rows.
Problem: returning duplicate rows.
Steps:
----------
SET hive.server2.tez.queue.access.check=true;
SET tez.queue.name=default
SET hive.query.results.cache.enabled=false;
SET hive.fetch.task.conversion=none;
SET hive.execution.engine=tez;
SET hive.stats.autogather=true;
SET hive.server2.enable.doAs=false;
SET hive.auto.convert.join=false;
drop table if exists hive1_tbl_data;
drop table if exists hive2_tbl_data;
drop table if exists hive3_tbl_data;
drop table if exists hive4_tbl_data;
CREATE EXTERNAL TABLE hive1_tbl_data (COLUMID string,COLUMN_FN string,COLUMN_LN string,EMAIL string,COL_UPDATED_DATE timestamp, PK_COLUM string)
ROW FORMAT SERDE
'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
TBLPROPERTIES (
'TRANSLATED_TO_EXTERNAL'='true',
'bucketing_version'='2',
'external.table.purge'='true',
'parquet.compression'='SNAPPY');
CREATE EXTERNAL TABLE hive2_tbl_data (COLUMID string,COLUMN_FN string,COLUMN_LN string,EMAIL string,COL_UPDATED_DATE timestamp, PK_COLUM string)
ROW FORMAT SERDE
'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
TBLPROPERTIES (
'TRANSLATED_TO_EXTERNAL'='true',
'bucketing_version'='2',
'external.table.purge'='true',
'parquet.compression'='SNAPPY');
CREATE EXTERNAL TABLE hive3_tbl_data (COLUMID string,COLUMN_FN string,COLUMN_LN string,EMAIL string,COL_UPDATED_DATE timestamp, PK_COLUM string)
ROW FORMAT SERDE
'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
TBLPROPERTIES (
'TRANSLATED_TO_EXTERNAL'='true',
'bucketing_version'='2',
'external.table.purge'='true',
'parquet.compression'='SNAPPY');
CREATE EXTERNAL TABLE hive4_tbl_data (COLUMID string,COLUMN_FN string,COLUMN_LN string,EMAIL string,COL_UPDATED_DATE timestamp, PK_COLUM string)
ROW FORMAT SERDE
'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
TBLPROPERTIES (
'TRANSLATED_TO_EXTERNAL'='true',
'bucketing_version'='2',
'external.table.purge'='true',
'parquet.compression'='SNAPPY');
insert into table hive1_tbl_data select '00001','john','doe','john@hotmail.com','2014-01-01 12:01:02','4000-10000';
insert into table hive1_tbl_data select '00002','john','doe','john@hotmail.com','2014-01-01 12:01:02','4000-10000';insert into table hive2_tbl_data select '00001','john','doe','john@hotmail.com','2014-01-01 12:01:02','00001';
insert into table hive2_tbl_data select '00002','john','doe','john@hotmail.com','2014-01-01 12:01:02','00001';
select
t.COLUMID
from (
select distinct
t.COLUMID as COLUMID
from (SELECT COLUMID FROM hive3_tbl_data UNION ALL SELECT COLUMID FROM hive1_tbl_data) t
) t
left join (
select
distinct t.COLUMID
from (SELECT COLUMID FROM hive4_tbl_data UNION ALL SELECT COLUMID FROM hive2_tbl_data) t
) t1 on t.COLUMID = t1.COLUMID
where t1.COLUMID is null;
Attachments
Issue Links
- fixes
-
HIVE-24907 Wrong results with LEFT JOIN and subqueries with UNION and GROUP BY
- Closed
- links to