Uploaded image for project: 'Hive'
  1. Hive
  2. HIVE-27303

select query result is different when enable/disable mapjoin with UNION ALL

    XMLWordPrintableJSON

Details

    • Bug
    • Status: Closed
    • Major
    • Resolution: Fixed
    • None
    • 4.0.0
    • None

    Description

      select query result is different when enable/disable mapjoin with UNION ALL

      Below are the reproduce steps.

      As per query when map.join is disabled it should not give rows(duplicate). Same is working fine with map.join=true.

      Expected result: Empty rows.

      Problem: returning duplicate rows.

      Steps:

      ----------

      SET hive.server2.tez.queue.access.check=true;
      SET tez.queue.name=default
      SET hive.query.results.cache.enabled=false;
      SET hive.fetch.task.conversion=none;
      SET hive.execution.engine=tez;
      SET hive.stats.autogather=true;
      SET hive.server2.enable.doAs=false;
      SET hive.auto.convert.join=false;

      drop table if exists hive1_tbl_data;
      drop table if exists hive2_tbl_data;
      drop table if exists hive3_tbl_data;
      drop table if exists hive4_tbl_data;

      CREATE EXTERNAL TABLE hive1_tbl_data (COLUMID string,COLUMN_FN string,COLUMN_LN string,EMAIL string,COL_UPDATED_DATE timestamp, PK_COLUM string) 
       ROW FORMAT SERDE                                   
         'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'  
       STORED AS INPUTFORMAT                              
         'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'  
       OUTPUTFORMAT                                       
         'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' 
       TBLPROPERTIES (                                    
         'TRANSLATED_TO_EXTERNAL'='true',                 
         'bucketing_version'='2',                         
         'external.table.purge'='true',                   
         'parquet.compression'='SNAPPY');

      CREATE EXTERNAL TABLE hive2_tbl_data (COLUMID string,COLUMN_FN string,COLUMN_LN string,EMAIL string,COL_UPDATED_DATE timestamp, PK_COLUM string) 
       ROW FORMAT SERDE                                   
         'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'  
       STORED AS INPUTFORMAT                              
         'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'  
       OUTPUTFORMAT                                       
         'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' 
       TBLPROPERTIES (                                    
         'TRANSLATED_TO_EXTERNAL'='true',                 
         'bucketing_version'='2',                         
         'external.table.purge'='true',                   
         'parquet.compression'='SNAPPY');

      CREATE EXTERNAL TABLE hive3_tbl_data (COLUMID string,COLUMN_FN string,COLUMN_LN string,EMAIL string,COL_UPDATED_DATE timestamp, PK_COLUM string) 
       ROW FORMAT SERDE                                   
         'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'  
       STORED AS INPUTFORMAT                              
         'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'  
       OUTPUTFORMAT                                       
         'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' 
       TBLPROPERTIES (                                    
         'TRANSLATED_TO_EXTERNAL'='true',                 
         'bucketing_version'='2',                         
         'external.table.purge'='true',                   
         'parquet.compression'='SNAPPY');

         CREATE EXTERNAL TABLE hive4_tbl_data (COLUMID string,COLUMN_FN string,COLUMN_LN string,EMAIL string,COL_UPDATED_DATE timestamp, PK_COLUM string) 
       ROW FORMAT SERDE                                   
         'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'  
       STORED AS INPUTFORMAT                              
         'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'  
       OUTPUTFORMAT                                       
         'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' 
       TBLPROPERTIES (                                    
         'TRANSLATED_TO_EXTERNAL'='true',                 
         'bucketing_version'='2',                         
         'external.table.purge'='true',                   
         'parquet.compression'='SNAPPY');

       

      insert into table hive1_tbl_data select '00001','john','doe','john@hotmail.com','2014-01-01 12:01:02','4000-10000';

      insert into table hive1_tbl_data select '00002','john','doe','john@hotmail.com','2014-01-01 12:01:02','4000-10000';insert into table hive2_tbl_data select '00001','john','doe','john@hotmail.com','2014-01-01 12:01:02','00001'; 
      insert into table hive2_tbl_data select '00002','john','doe','john@hotmail.com','2014-01-01 12:01:02','00001'; 

       

      select
             t.COLUMID
        from (
            select distinct
                t.COLUMID as COLUMID
            from (SELECT COLUMID FROM hive3_tbl_data UNION ALL SELECT COLUMID FROM hive1_tbl_data) t
        ) t
        left join (
            select
                 distinct t.COLUMID
            from (SELECT COLUMID FROM hive4_tbl_data UNION ALL SELECT COLUMID FROM hive2_tbl_data) t
        ) t1 on t.COLUMID = t1.COLUMID
        where t1.COLUMID is null;

       

      Attachments

        Issue Links

          Activity

            People

              maheshrajus Mahesh Raju Somalaraju
              maheshrajus Mahesh Raju Somalaraju
              Votes:
              0 Vote for this issue
              Watchers:
              5 Start watching this issue

              Dates

                Created:
                Updated:
                Resolved: