Uploaded image for project: 'Hive'
  1. Hive
  2. HIVE-20825

Hive ACID Merge generates invalid ORC files (bucket files 0 or 3 bytes in length) causing the "Not a valid ORC file" error

    XMLWordPrintableJSON

Details

    • Bug
    • Status: Open
    • Major
    • Resolution: Unresolved
    • 2.3.1, 2.3.2, 2.3.4
    • None
    • Hive, ORC, Transactions
    • None
    • Hive 2.3.x on Amazon EMR 5.8.0 to 5.18.0. Open source build of Hive 2.3.4

    Description

      When using Hive ACID Merge (supported with the ORC format) to update/insert data, bucket files with 0 byte or 3 bytes (file content contains three characters: ORC) are generated during MERGE INTO operations which finish with no errors. Subsequent queries on the base table will get "Not a valid ORC file" error.

       

      The following script can be used to reproduce the issue(note that with small amount of data like this increasing the number of buckets could result in query working, but with large data set it will fail no matter what bucket size):

      set hive.auto.convert.join=false;
      set hive.enforce.bucketing=true;
      set hive.exec.dynamic.partition.mode = nonstrict;
      set hive.support.concurrency=true;
      set hive.txn.manager = org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;

      drop table if exists mergedelta_txt_1;
      drop table if exists mergedelta_txt_2;

      CREATE TABLE mergedelta_txt_1 (
      id_str varchar(12), time_key int, value bigint)
      PARTITIONED BY (date_key int)
      ROW FORMAT DELIMITED
      STORED AS TEXTFILE;

      CREATE TABLE mergedelta_txt_2 (
      id_str varchar(12), time_key int, value bigint)
      PARTITIONED BY (date_key int)
      ROW FORMAT DELIMITED
      STORED AS TEXTFILE;

      INSERT INTO TABLE mergedelta_txt_1
      partition(date_key=20170103)
      VALUES
      ("AB94LIENR0",46700,12345676836978),
      ("AB94LIENR1",46825,12345676836978),
      ("AB94LIENS0",46709,12345676836978),
      ("AB94LIENS1",46834,12345676836978),
      ("AB94LIENT0",46709,12345676836978),
      ("AB94LIENT1",46834,12345676836978),
      ("AB94LIENU0",46718,12345676836978),
      ("AB94LIENU1",46844,12345676836978),
      ("AB94LIENV0",46719,12345676836978),
      ("AB94LIENV1",46844,12345676836978),
      ("AB94LIENW0",46728,12345676836978),
      ("AB94LIENW1",46854,12345676836978),
      ("AB94LIENX0",46728,12345676836978),
      ("AB94LIENX1",46854,12345676836978),
      ("AB94LIENY0",46737,12345676836978),
      ("AB94LIENY1",46863,12345676836978),
      ("AB94LIENZ0",46738,12345676836978),
      ("AB94LIENZ1",46863,12345676836978),
      ("AB94LIERA0",47176,12345676836982),
      ("AB94LIERA1",47302,12345676836982);

      INSERT INTO TABLE mergedelta_txt_2
      partition(date_key=20170103)
      VALUES
      ("AB94LIENT1",46834,12345676836978),
      ("AB94LIENU0",46718,12345676836978),
      ("AB94LIENU1",46844,12345676836978),
      ("AB94LIENV0",46719,12345676836978),
      ("AB94LIENV1",46844,12345676836978),
      ("AB94LIENW0",46728,12345676836978),
      ("AB94LIENW1",46854,12345676836978),
      ("AB94LIENX0",46728,12345676836978),
      ("AB94LIENX1",46854,12345676836978),
      ("AB94LIENY0",46737,12345676836978),
      ("AB94LIENY1",46863,12345676836978),
      ("AB94LIENZ0",46738,12345676836978),
      ("AB94LIENZ1",46863,12345676836978),
      ("AB94LIERA0",47176,12345676836982),
      ("AB94LIERA1",47302,12345676836982),
      ("AB94LIERA2",47418,12345676836982),
      ("AB94LIERB0",47176,12345676836982),
      ("AB94LIERB1",47302,12345676836982),
      ("AB94LIERB2",47418,12345676836982),
      ("AB94LIERC0",47185,12345676836982);

      DROP TABLE IF EXISTS mergebase_1;
      CREATE TABLE mergebase_1 (
      id_str varchar(12) , time_key int , value bigint)
      PARTITIONED BY (date_key int)
      CLUSTERED BY (id_str,time_key) INTO 4 BUCKETS
      STORED AS ORC
      TBLPROPERTIES (
      'orc.compress'='SNAPPY',
      'pk_columns'='id_str,date_key,time_key',
      'NO_AUTO_COMPACTION'='true',
      'transactional'='true');

      MERGE INTO mergebase_1 AS base
      USING (SELECT *
      FROM (
      SELECT id_str ,time_key ,value, date_key, rank() OVER (PARTITION BY id_str,date_key,time_key ORDER BY id_str,date_key,time_key) AS rk
      FROM mergedelta_txt_1
      DISTRIBUTE BY date_key
      ) rankedtbl
      WHERE rankedtbl.rk=1
      ) AS delta
      ON delta.id_str=base.id_str AND delta.date_key=base.date_key AND delta.time_key=base.time_key
      WHEN MATCHED THEN UPDATE SET value=delta.value
      WHEN NOT MATCHED THEN INSERT VALUES ( delta.id_str , delta.time_key , delta.value, delta.date_key);

      MERGE INTO mergebase_1 AS base
      USING (SELECT *
      FROM (
      SELECT id_str ,time_key ,value, date_key, rank() OVER (PARTITION BY id_str,date_key,time_key ORDER BY id_str,date_key,time_key) AS rk
      FROM mergedelta_txt_2
      DISTRIBUTE BY date_key
      ) rankedtbl
      WHERE rankedtbl.rk=1
      ) AS delta
      ON delta.id_str=base.id_str AND delta.date_key=base.date_key AND delta.time_key=base.time_key
      WHEN MATCHED THEN UPDATE SET value=delta.value
      WHEN NOT MATCHED THEN INSERT VALUES ( delta.id_str , delta.time_key , delta.value, delta.date_key);

      select count from mergebase_1;

      Attachments

        1. hive-merge-invalid-orc-repro.hql
          4 kB
          Tom Zeng
        2. hive-merge-invalid-orc-repro.log
          46 kB
          Tom Zeng

        Activity

          People

            Unassigned Unassigned
            tomzeng Tom Zeng
            Votes:
            0 Vote for this issue
            Watchers:
            4 Start watching this issue

            Dates

              Created:
              Updated: