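When `pa.Table.from_pandas()` is called with `preserve_index=True` on a DataFrame whose index is named (`dataframe.index.name is not None`), the `__index_level_` prefix is not added to the corresponding schema field name; the field keeps the plain index name instead. This breaks `write_to_dataset` when partition columns are used.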
# Reproduction script: write a partitioned parquet dataset from a DataFrame
# that has a *named* index and a column holding variable-length arrays.
import os
import shutil

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

PATH_PYARROW_MANUAL = '/tmp/pyarrow_manual.pa/'

# Start from an empty output directory on every run.
if os.path.exists(PATH_PYARROW_MANUAL):
    shutil.rmtree(PATH_PYARROW_MANUAL)
os.mkdir(PATH_PYARROW_MANUAL)

# Four rows: two hold arrays of different lengths, two hold NaN.
arrays = np.array([
    np.array([0, 1, 2]),
    np.array([3, 4]),
    np.nan,
    np.nan,
])

df = pd.DataFrame([0, 0, 1, 1], columns=['partition_column'])
df['arrays'] = pd.Series(arrays)
# Naming the index is the step the report identifies as the trigger.
df.index.name = 'ID'

table = pa.Table.from_pandas(df, preserve_index=True)
# Show which field names the conversion produced for the columns and index.
print(table.schema.names)

pq.write_to_dataset(
    table,
    root_path=PATH_PYARROW_MANUAL,
    partition_cols=['partition_column'],
    preserve_index=True,
)
Removing the line `df.index.name='ID'` makes the example run without error. Omitting `partition_cols` from the `write_to_dataset` call also avoids the failure.
- links to