Details
- Type: Bug
- Status: Closed
- Priority: Critical
- Resolution: Fixed
- Affects Version/s: 1.12.0
- Fix Version/s: None
Description
Writing a parquet file with version 1.12.0 in Apache Hive and then reading that file back fails with the following error:
Caused by: java.lang.IllegalStateException: All of the offsets in the split should be found in the file. expected: [4, 133961161] found: [BlockMetaData{1530100, 133961157
  [ColumnMetaData{UNCOMPRESSED [c_customer_sk] optional int64 c_customer_sk [PLAIN, RLE, BIT_PACKED], 4},
  ColumnMetaData{UNCOMPRESSED [c_customer_id] optional binary c_customer_id (STRING) [PLAIN, RLE, BIT_PACKED], 12243647},
  ColumnMetaData{UNCOMPRESSED [c_current_cdemo_sk] optional int64 c_current_cdemo_sk [PLAIN, RLE, BIT_PACKED], 42848491},
  ColumnMetaData{UNCOMPRESSED [c_current_hdemo_sk] optional int64 c_current_hdemo_sk [RLE, PLAIN_DICTIONARY, BIT_PACKED], 54868535},
  ColumnMetaData{UNCOMPRESSED [c_current_addr_sk] optional int64 c_current_addr_sk [PLAIN, RLE, BIT_PACKED], 57421932},
  ColumnMetaData{UNCOMPRESSED [c_first_shipto_date_sk] optional int64 c_first_shipto_date_sk [RLE, PLAIN_DICTIONARY, BIT_PACKED], 69694809},
  ColumnMetaData{UNCOMPRESSED [c_first_sales_date_sk] optional int64 c_first_sales_date_sk [RLE, PLAIN_DICTIONARY, BIT_PACKED], 72093040},
  ColumnMetaData{UNCOMPRESSED [c_salutation] optional binary c_salutation (STRING) [RLE, PLAIN_DICTIONARY, BIT_PACKED], 74461508},
  ColumnMetaData{UNCOMPRESSED [c_first_name] optional binary c_first_name (STRING) [RLE, PLAIN_DICTIONARY, BIT_PACKED], 75092758},
  ColumnMetaData{UNCOMPRESSED [c_last_name] optional binary c_last_name (STRING) [RLE, PLAIN_DICTIONARY, BIT_PACKED], 77626525},
  ColumnMetaData{UNCOMPRESSED [c_preferred_cust_flag] optional binary c_preferred_cust_flag (STRING) [RLE, PLAIN_DICTIONARY, BIT_PACKED], 80116456},
  ColumnMetaData{UNCOMPRESSED [c_birth_day] optional int32 c_birth_day [RLE, PLAIN_DICTIONARY, BIT_PACKED], 80505351},
  ColumnMetaData{UNCOMPRESSED [c_birth_month] optional int32 c_birth_month [RLE, PLAIN_DICTIONARY, BIT_PACKED], 81581772},
  ColumnMetaData{UNCOMPRESSED [c_birth_year] optional int32 c_birth_year [RLE, PLAIN_DICTIONARY, BIT_PACKED], 82473740},
  ColumnMetaData{UNCOMPRESSED [c_birth_country] optional binary c_birth_country (STRING) [RLE, PLAIN_DICTIONARY, BIT_PACKED], 83921564},
  ColumnMetaData{UNCOMPRESSED [c_login] optional binary c_login (STRING) [RLE, PLAIN_DICTIONARY, BIT_PACKED], 85457674},
  ColumnMetaData{UNCOMPRESSED [c_email_address] optional binary c_email_address (STRING) [PLAIN, RLE, BIT_PACKED], 85460523},
  ColumnMetaData{UNCOMPRESSED [c_last_review_date_sk] optional int64 c_last_review_date_sk [RLE, PLAIN_DICTIONARY, BIT_PACKED], 132146109}]}]
    at org.apache.parquet.hadoop.ParquetRecordReader.initializeInternalReader(ParquetRecordReader.java:172) ~[parquet-hadoop-bundle-1.12.0.jar:1.12.0]
    at org.apache.parquet.hadoop.ParquetRecordReader.initialize(ParquetRecordReader.java:140) ~[parquet-hadoop-bundle-1.12.0.jar:1.12.0]
    at org.apache.hadoop.hive.ql.io.parquet.read.ParquetRecordReaderWrapper.<init>(ParquetRecordReaderWrapper.java:95) ~[hive-exec-4.0.0-SNAPSHOT.jar:4.0.0-SNAPSHOT]
    at org.apache.hadoop.hive.ql.io.parquet.read.ParquetRecordReaderWrapper.<init>(ParquetRecordReaderWrapper.java:60) ~[hive-exec-4.0.0-SNAPSHOT.jar:4.0.0-SNAPSHOT]
    at org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat.getRecordReader(MapredParquetInputFormat.java:89) ~[hive-exec-4.0.0-SNAPSHOT.jar:4.0.0-SNAPSHOT]
    at org.apache.hadoop.hive.ql.io.CombineHiveRecordReader.<init>(CombineHiveRecordReader.java:96) ~[hive-exec-4.0.0-SNAPSHOT.jar:4.0.0-SNAPSHOT]
    at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method) ~[?:1.8.0_292]
    at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62) ~[?:1.8.0_292]
    at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45) ~[?:1.8.0_292]
    at java.lang.reflect.Constructor.newInstance(Constructor.java:423) ~[?:1.8.0_292]
    at org.apache.hadoop.hive.shims.HadoopShimsSecure$CombineFileRecordReader.initNextRecordReader(HadoopShimsSecure.java:254) ~[hive-exec-4.0.0-SNAPSHOT.jar:4.0.0-SNAPSHOT]
    at org.apache.hadoop.hive.shims.HadoopShimsSecure$CombineFileRecordReader.<init>(HadoopShimsSecure.java:214) ~[hive-exec-4.0.0-SNAPSHOT.jar:4.0.0-SNAPSHOT]
    at org.apache.hadoop.hive.shims.HadoopShimsSecure$CombineFileInputFormatShim.getRecordReader(HadoopShimsSecure.java:342) ~[hive-exec-4.0.0-SNAPSHOT.jar:4.0.0-SNAPSHOT]
    at org.apache.hadoop.hive.ql.io.CombineHiveInputFormat.getRecordReader(CombineHiveInputFormat.java:716) ~[hive-exec-4.0.0-SNAPSHOT.jar:4.0.0-SNAPSHOT]
    at org.apache.hadoop.mapred.MapTask$TrackedRecordReader.<init>(MapTask.java:175) ~[hadoop-mapreduce-client-core-3.1.4.jar:?]
    at org.apache.hadoop.mapred.MapTask.runOldMapper(MapTask.java:444) ~[hadoop-mapreduce-client-core-3.1.4.jar:?]
    at org.apache.hadoop.mapred.MapTask.run(MapTask.java:349) ~[hadoop-mapreduce-client-core-3.1.4.jar:?]
    at org.apache.hadoop.mapred.LocalJobRunner$Job$MapTaskRunnable.run(LocalJobRunner.java:271) ~[hadoop-mapreduce-client-common-3.1.4.jar:?]
    at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) ~[?:1.8.0_292]
    at java.util.concurrent.FutureTask.run(FutureTask.java:266) ~[?:1.8.0_292]
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) ~[?:1.8.0_292]
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) ~[?:1.8.0_292]
    at java.lang.Thread.run(Thread.java:748) ~[?:1.8.0_292]
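For context, the check that throws here sits in ParquetRecordReader.initializeInternalReader: the row-group offsets carried by the input split are matched against the starting positions of the row groups listed in the file footer, and the reader bails out if any split offset has no matching block. A simplified sketch of that logic (illustrative only, not the exact parquet-mr source; footer and rowGroupOffsets stand in for the reader's internal state):

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

class SplitOffsetCheck {
  // Simplified sketch of the failing consistency check: every offset the split
  // carries must match the starting position of some block in the footer.
  static List<BlockMetaData> filterBlocksByOffsets(ParquetMetadata footer, long[] rowGroupOffsets) {
    List<BlockMetaData> filtered = new ArrayList<>();
    for (long offset : rowGroupOffsets) {
      for (BlockMetaData block : footer.getBlocks()) {
        if (block.getStartingPos() == offset) { // where the block's first column chunk begins
          filtered.add(block);
        }
      }
    }
    if (filtered.size() != rowGroupOffsets.length) {
      throw new IllegalStateException("All of the offsets in the split should be found in the file."
          + " expected: " + Arrays.toString(rowGroupOffsets) + " found: " + filtered);
    }
    return filtered;
  }
}

In the trace above the split expects offsets [4, 133961161], but only the block starting at offset 4 survives the filter, so the size check fails: the second row group written by 1.12.0 does not start where the split metadata says it should.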
Reproduce Scenario:
TPC-DS table customer; any parquet file written by 1.12.0 that is larger than 128 MB (i.e. contains two row groups) reproduces the issue.
create table if not exists customer(
  c_customer_sk bigint,
  c_customer_id char(16),
  c_current_cdemo_sk bigint,
  c_current_hdemo_sk bigint,
  c_current_addr_sk bigint,
  c_first_shipto_date_sk bigint,
  c_first_sales_date_sk bigint,
  c_salutation char(10),
  c_first_name char(20),
  c_last_name char(30),
  c_preferred_cust_flag char(1),
  c_birth_day int,
  c_birth_month int,
  c_birth_year int,
  c_birth_country varchar(20),
  c_login char(13),
  c_email_address char(50),
  c_last_review_date_sk bigint
)
stored as parquet
location 'file:///home/username/data/customer';

-- after adding the data file:
select count(*) from customer;
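To inspect what the footer of a suspect file actually records, the row-group offsets can be dumped with the parquet-hadoop API. A minimal sketch, assuming the file path is passed as the first argument (the class name and path handling are illustrative):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.util.HadoopInputFile;

public class DumpRowGroupOffsets {
  public static void main(String[] args) throws Exception {
    // e.g. a data file under file:///home/username/data/customer (path is illustrative)
    Path path = new Path(args[0]);
    try (ParquetFileReader reader =
        ParquetFileReader.open(HadoopInputFile.fromPath(path, new Configuration()))) {
      // Print where each row group starts according to the footer, plus its
      // row count and total byte size (the two numbers shown in the error's
      // BlockMetaData toString above).
      for (BlockMetaData block : reader.getFooter().getBlocks()) {
        System.out.printf("row group: startingPos=%d rows=%d totalByteSize=%d%n",
            block.getStartingPos(), block.getRowCount(), block.getTotalByteSize());
      }
    }
  }
}

Comparing each block's starting position against the offsets the splits expect shows directly whether the footer written by 1.12.0 is consistent with the data pages in the file.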
Attachments
Issue Links
- is related to
  - SPARK-34276 Check the unreleased/unresolved JIRAs/PRs of Parquet 1.11 and 1.12 (Resolved)
- relates to
  - SPARK-36696 spark.read.parquet loads empty dataset (Resolved)
  - SPARK-36726 Upgrade Parquet to 1.12.1 (Resolved)
- links to