Description
In the 4 table join scenario, when join on is set to "or" for filtering and is a fixed value condition, data is incorrectly listed.
The problem is reproduced as follows:
CREATE TABLE `table1`(
`column1` varchar(90),
`column2` varchar(11) ,
`column3` varchar(18),
`column4` varchar(3) ,
`column5` varchar(12) ,
`column6` varchar(29) ,
`column7` varchar(8) ,
`column8` varchar(11) ,
`column9` varchar(5),
`column10` varchar(2) )
PARTITIONED BY (
`pt_dt` varchar(10) )
ROW FORMAT SERDE
'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
WITH SERDEPROPERTIES (
'field.delim'='27',
'serialization.format'='27')
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat' ;
CREATE TABLE `table2`(
`column2_1` varchar(9) ,
`column2_2` varchar(30) ,
`column2_3` varchar(30),
`column2_4` varchar(90) ,
`column2_5` varchar(15))
PARTITIONED BY (
`pt_dt` varchar(10))
ROW FORMAT SERDE
'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
WITH SERDEPROPERTIES (
'field.delim'='27',
'serialization.format'='27')
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat' ;
CREATE TABLE `table3`(
`column3_1` varchar(30) ,
`column3_2` varchar(30) ,
`column3_3` varchar(30) ,
`column3_4` varchar(30),
`column3_5` varchar(30))
ROW FORMAT SERDE
'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
WITH SERDEPROPERTIES (
'field.delim'='27',
'serialization.format'='27')
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat' ;
CREATE TABLE `table4`(
`column4_2` string ,
`column4_3` string ,
`column4_1` string ,
`column4_4` string )
PARTITIONED BY (
`pt_dt` string)
ROW FORMAT SERDE
'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
WITH SERDEPROPERTIES (
'field.delim'='27',
'serialization.format'='27')
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ;
Insert Reproduction Data:
insert into table1 partition (pt_dt='2023-11-30') (column6,column3,column7,column2,column5,column10,column1,column8,column9,column4) values (16160200980000057,0140010000296,00001,0000001,'2022-11-30',1,1,1234567,12345,1),(27120403980000164,0140010000296,00002,0000001,'2022-11-30',1,1,1234567,12345,1),(26040204980001179,0140010000296,00001,0000001,'2022-11-30',1,1,1234567,12345,1),(20100213980049933,0140010000296,00001,0000001,'2022-11-30',1,1,1234567,12345,1),(16070091980002440,0140010000296,00001,0000001,'2022-11-30',1,1,1234567,12345,1),(16070017980004555,0030100000004,00001,0000001,'2022-11-30',1,1,1234567,12345,1),(06060822980002332,0030100000013,00001,0000001,'2022-11-30',1,1,1234567,12345,1);
insert into table2 partition (pt_dt='2023-11-30') (column2_1,column2_4,column2_2,column2_3,column2_5) values ('S6',1,016160020000000055,027120000600001061,0161600203),('S6',1,027120000600001061,016160020000000055,0271200006),('S6',1,026040000700001541,026040000700001541,0260400007),('S6',1,020100025900019726,016070001700001229,0201000259),('S6',1,016070009100000471,016070001700001229,0160700091),('S6',1,016070001700001229,016070009100000471,0160700296),('S6',1,006060000500001050,016070009100000471,0060600005);
Run the reproduction SQL statement:
SELECT
T2.column2_5
,T2.column2_2
,T6.column3_5
,T6.column3_2
,T6.column3_4
,T6.column3_3
,T1.column10
,T2.column2_4
FROM (SELECT
column8
,column1
,column5
,column9
,column4
,PT_DT
,column10
,column10 AS column10_2
FROM table1
WHERE PT_DT= '2023-11-30'
) T1
INNER JOIN (SELECT column2_4
,column2_3
,column2_2
,column2_5
FROM table2 A
WHERE PT_DT = '2023-11-30'
)T2
ON T1.column1 = T2.column2_4
LEFT JOIN (SELECT column4_1,column4_2,column4_3,column4_4
FROM table4
WHERE PT_DT='2023-11-30'
) D
on ( D.column4_4 = 1 ) OR ( T1.column10_2 = '0')
LEFT JOIN ( SELECT
F.column3_1
,F.column3_2
,F.column3_3
,F.column3_4
,F.column3_5
FROM table3 F
) T6
ON T2.column2_5 = T6.column3_1
WHERE D.column4_3 IS NULL ;
Expected data:
Abnormal data: