Details
-
Bug
-
Status: Resolved
-
Major
-
Resolution: Fixed
-
None
-
pyarrow 0.13, Windows 10
Description
This is pyarrow 0.13 on Windows.
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq


def make_table(num_rows):
    """Build a two-column table ('a', 'b') of float32 list values.

    For row index i, column 'a' holds a list of i % 10 zeros and column 'b'
    holds a list of (i + 5) % 10 zeros. The list item field is created with
    its third argument set to False (i.e. declared non-nullable).
    """
    item_field = pa.field("item", pa.float32(), False)
    typ = pa.list_(item_field)
    col_a = pa.array([[0] * (i % 10) for i in range(num_rows)], type=typ)
    col_b = pa.array([[0] * ((i + 5) % 10) for i in range(num_rows)], type=typ)
    return pa.Table.from_arrays([col_a, col_b], ['a', 'b'])


# Round-trip one million rows through a Parquet file; the read_table call is
# the statement reported to fail.
pq.write_table(make_table(1000000), 'test.parquet')
pq.read_table('test.parquet')
The last line throws the following exception:
---------------------------------------------------------------------------
ArrowInvalid                              Traceback (most recent call last)
<ipython-input-4-0f3266afa36c> in <module>
----> 1 pq.read_table('full.parquet')

~\Anaconda3\lib\site-packages\pyarrow\parquet.py in read_table(source, columns, use_threads, metadata, use_pandas_metadata, memory_map, filesystem)
   1150         return fs.read_parquet(path, columns=columns,
   1151                                use_threads=use_threads, metadata=metadata,
-> 1152                                use_pandas_metadata=use_pandas_metadata)
   1153
   1154     pf = ParquetFile(source, metadata=metadata)

~\Anaconda3\lib\site-packages\pyarrow\filesystem.py in read_parquet(self, path, columns, metadata, schema, use_threads, use_pandas_metadata)
    179                                         filesystem=self)
    180         return dataset.read(columns=columns, use_threads=use_threads,
--> 181                             use_pandas_metadata=use_pandas_metadata)
    182
    183     def open(self, path, mode='rb'):

~\Anaconda3\lib\site-packages\pyarrow\parquet.py in read(self, columns, use_threads, use_pandas_metadata)
   1012             table = piece.read(columns=columns, use_threads=use_threads,
   1013                                partitions=self.partitions,
-> 1014                                use_pandas_metadata=use_pandas_metadata)
   1015             tables.append(table)
   1016

~\Anaconda3\lib\site-packages\pyarrow\parquet.py in read(self, columns, use_threads, partitions, open_file_func, file, use_pandas_metadata)
    562             table = reader.read_row_group(self.row_group, **options)
    563         else:
--> 564             table = reader.read(**options)
    565
    566         if len(self.partition_keys) > 0:

~\Anaconda3\lib\site-packages\pyarrow\parquet.py in read(self, columns, use_threads, use_pandas_metadata)
    212             columns, use_pandas_metadata=use_pandas_metadata)
    213         return self.reader.read_all(column_indices=column_indices,
--> 214                                     use_threads=use_threads)
    215
    216     def scan_contents(self, columns=None, batch_size=65536):

~\Anaconda3\lib\site-packages\pyarrow\_parquet.pyx in pyarrow._parquet.ParquetReader.read_all()

~\Anaconda3\lib\site-packages\pyarrow\error.pxi in pyarrow.lib.check_status()

ArrowInvalid: Column 1 named b expected length 932066 but got length 932063
Attachments
Issue Links
- is caused by
-
PARQUET-1652 [C++] ColumnWriter writes incorrect "num_values" metadata for nested types
- Closed
- links to