Uploaded image for project: 'Hive'
  1. Hive
  2. HIVE-20525

Hive UNION ALL query gives wrong results with hive.optimize.skewjoin=true

    XMLWordPrintableJSON

Details

    • Bug
    • Status: Open
    • Major
    • Resolution: Unresolved
    • 1.2.1, 2.1.1
    • None
    • Query Planning
    • None

    Description

      Following query gives 0 rows instead of the expected 3 rows. Tested on dataproc 1.2.47-deb8 with the default hive version 2.1.1. The query throws an ArrayIndexOutOfBoundsException if hive.auto.convert.join=true.

      On HDP2.6.4 sandbox, hive 1.2.1 version this gives 2 rows instead of 3

      
      set hive.optimize.skewjoin=true;
      set hive.auto.convert.join=false;
      
      CREATE TABLE t_ins (user_id bigint, app_id string, store string, oem string ,dt string);
      CREATE TABLE t_fa (user_id bigint, app_type string, build_name string, dt string);
      CREATE TABLE t_la (user_id bigint, oem string, dt string);
      CREATE TABLE t_tu (user_id bigint, new_user_id bigint, dt string);
      CREATE TABLE t_duc (country_code string, user_id bigint);
      CREATE TABLE t_app (name string, app_id string, dt string);
      
      INSERT INTO t_ins VALUES (1234567, 'xyzuvw','store1','OEM1','20180814');
      INSERT INTO t_app VALUES ('app1','xyzuvw','20180814');
      
      INSERT INTO t_fa VALUES (1234567,'app1','store1','20180814');
      INSERT INTO t_la VALUES (1234567,'OEM1','20180814');
      
      INSERT INTO t_tu VALUES (1234567,1111111,'20180814');
      INSERT INTO t_duc VALUES ('IN',1234567);
      
      
      INSERT INTO t_fa VALUES (7654321,'app2','store2','20180814');
      INSERT INTO t_la VALUES (7654321,'OEM2','20180814');
      
      INSERT INTO t_tu VALUES (7654321,1111111,'20180814');
      INSERT INTO t_duc VALUES ('SE',7654321);
      
      
      CREATE TABLE `abs`(
      `country_code` string, 
      `app_type` string, 
      `build_name` string, 
      `device` string, 
      `new_user_id` string, 
      `user_type_1` smallint, 
      `user_type_2` smallint)
      PARTITIONED BY ( 
      `dt` string)
      ROW FORMAT SERDE 
      'org.apache.hadoop.hive.ql.io.orc.OrcSerde' 
      STORED AS INPUTFORMAT 
      'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat' 
      OUTPUTFORMAT 
      'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
      ;
      
      
      INSERT OVERWRITE TABLE abs PARTITION (dt=20180814)
      
      SELECT
      duc.country_code country_code,
      app.name app_type,
      i.store build_name,
      i.oem device,
      tu.new_user_id new_user_id,
      1 user_type_1,
      0 user_type_2
      FROM
      t_ins i
      JOIN t_app app ON app.app_id = i.app_id AND app.dt = 20180814
      LEFT JOIN t_duc duc on i.user_id = duc.user_id
      LEFT JOIN t_tu tu on tu.user_id = i.user_id and tu.dt = 20180814
      WHERE
      i.dt = 20180814
      GROUP BY
      duc.country_code,
      app.name,
      i.store,
      i.oem,
      tu.new_user_id
      
      UNION ALL
      
      SELECT
      duc.country_code country_code,
      fa.app_type app_type,
      fa.build_name build_name,
      la.oem device,
      tu.new_user_id new_user_id,
      0 user_type_1,
      1 user_type_2
      FROM
      t_fa fa
      LEFT JOIN t_duc duc on fa.user_id = duc.user_id
      LEFT JOIN t_tu tu on tu.user_id = fa.user_id and tu.dt = 20180814
      LEFT JOIN t_la la ON fa.user_id = la.user_id AND la.dt = 20180814
      WHERE
      fa.dt = 20180814
      GROUP BY
      duc.country_code,
      fa.app_type,
      fa.build_name,
      la.oem,
      tu.new_user_id
      ;
      
      select * from abs where dt=20180814;
      
      

      Attachments

        Activity

          People

            Unassigned Unassigned
            dhaneshp Dhanesh Padmanabhan
            Votes:
            1 Vote for this issue
            Watchers:
            2 Start watching this issue

            Dates

              Created:
              Updated: