Details
-
Bug
-
Status: Resolved
-
Minor
-
Resolution: Fixed
-
9.0.0
-
None
-
Rust:
"arrow" = "21.0.0"
"parquet" = "21.0.0"
Python:
parquet-tools 0.2.11
pyarrow 9.0.0
Description
I'm generating Parquet V2 files with a boolean column, but when I try to read them with pyarrow (via parquet-tools or parq) I get:
OSError: Unknown encoding type.
To reproduce, run the following Rust program:
use arrow::json; use std::fs::File; const DATA: &'static str = r#" {"x": 1, "y": false} "#; fn main() -> anyhow::Result<()> { let mut json = json::ReaderBuilder::new().infer_schema(Some(2)) .build(std::io::Cursor::new(DATA.as_bytes()))?; let batch = json.next()?.unwrap(); let out_file = File::create("x.parquet")?; let props = parquet::file::properties::WriterProperties::builder() .set_writer_version( parquet::file::properties::WriterVersion::PARQUET_2_0) .build(); let mut writer = parquet::arrow::ArrowWriter::try_new( out_file, batch.schema(), Some(props))?; writer.write(&batch)?; writer.close()?; Ok(()) }
and then try to show the output file x.parquet:
$ cargo run
$ parquet-tools show x.parquet
Traceback (most recent call last):
  File "/home/nazgul/.local/bin/parquet-tools", line 8, in <module>
    sys.exit(main())
  File "/home/nazgul/.local/lib/python3.10/site-packages/parquet_tools/cli.py", line 26, in main
    args.handler(args)
  File "/home/nazgul/.local/lib/python3.10/site-packages/parquet_tools/commands/show.py", line 59, in _cli
    with get_datafame_from_objs(pfs, args.head) as df:
  File "/usr/lib/python3.10/contextlib.py", line 135, in __enter__
    return next(self.gen)
  File "/home/nazgul/.local/lib/python3.10/site-packages/parquet_tools/commands/utils.py", line 190, in get_datafame_from_objs
    df: Optional[pd.DataFrame] = stack.enter_context(pf.get_dataframe())
  File "/usr/lib/python3.10/contextlib.py", line 492, in enter_context
    result = _cm_type.__enter__(cm)
  File "/usr/lib/python3.10/contextlib.py", line 135, in __enter__
    return next(self.gen)
  File "/home/nazgul/.local/lib/python3.10/site-packages/parquet_tools/commands/utils.py", line 71, in get_dataframe
    yield pq.read_table(local_path).to_pandas()
  File "/home/nazgul/.local/lib/python3.10/site-packages/pyarrow/parquet/__init__.py", line 2827, in read_table
    return dataset.read(columns=columns, use_threads=use_threads,
  File "/home/nazgul/.local/lib/python3.10/site-packages/pyarrow/parquet/__init__.py", line 2473, in read
    table = self._dataset.to_table(
  File "pyarrow/_dataset.pyx", line 331, in pyarrow._dataset.Dataset.to_table
  File "pyarrow/_dataset.pyx", line 2577, in pyarrow._dataset.Scanner.to_table
  File "pyarrow/error.pxi", line 144, in pyarrow.lib.pyarrow_internal_check_status
  File "pyarrow/error.pxi", line 115, in pyarrow.lib.check_status
OSError: Unknown encoding type.