Details
-
Bug
-
Status: Open
-
Minor
-
Resolution: Unresolved
-
1.0.1
-
None
Description
This follows the docs at https://arrow.apache.org/docs/python/dataset.html#manual-specification-of-the-dataset, but uses date partitioning instead. If you create the partition expressions using pandas Timestamps, you get timestamp[ns] vs timestamp[us] type errors.
# Reproduction script: builds a manually specified FileSystemDataset whose
# partition expressions compare the "date" field against pandas Timestamps,
# first as bare pd.Timestamp values and then wrapped in explicit
# pa.scalar(..., pa.timestamp("ns")) values.
import pathlib
import tempfile

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.dataset as ds
import pyarrow.parquet as pq
from pyarrow import fs

base = pathlib.Path(tempfile.gettempdir())
dataset_dir = base / "parquet_dataset_manual"
file_names = ["data_20180101.parquet", "data_20180102.parquet"]

# Write the same three-row table into both parquet files.
table = pa.table({"col1": range(3), "col2": np.random.randn(3)})
dataset_dir.mkdir(exist_ok=True)
for file_name in file_names:
    pq.write_table(table, dataset_dir / file_name)

# The dataset schema declares "date" with nanosecond resolution.
schema = pa.schema(
    [
        ("date", pa.timestamp("ns")),
        ("col1", pa.int64()),
        ("col2", pa.float64()),
    ]
)

# Paths passed to from_paths are relative to this sub-tree filesystem.
local_subtree = fs.SubTreeFileSystem(str(dataset_dir), fs.LocalFileSystem())

# Filter used by the filtered reads below.
jan_first_filter = ds.field("date") == pd.Timestamp("2018-01-01")

# Case 1: partition expressions built from bare pandas Timestamps.
# NOTE(review): both expressions use 2018-01-01 here, while case 2 below uses
# 2018-01-01 and 2018-01-02 -- possibly a typo in the report; left as written.
dataset = ds.FileSystemDataset.from_paths(
    file_names,
    schema=schema,
    format=ds.ParquetFileFormat(),
    filesystem=local_subtree,
    partitions=[
        ds.field("date") == pd.Timestamp("2018-01-01"),
        ds.field("date") == pd.Timestamp("2018-01-01"),
    ],
)

print(dataset.to_table().to_pandas())
# pyarrow.lib.ArrowTypeError: field date: timestamp[ns] cannot be materialized from scalar of type timestamp[us]

print(dataset.to_table(filter=jan_first_filter).to_pandas())
# ../src/arrow/result.cc:28: ValueOrDie called on an error: Type error: Cannot compare scalars of differing type: timestamp[ns] vs timestamp[us]

# Case 2: partition expressions built from scalars explicitly typed as
# timestamp[ns], matching the schema's "date" field.
dataset = ds.FileSystemDataset.from_paths(
    file_names,
    schema=schema,
    format=ds.ParquetFileFormat(),
    filesystem=local_subtree,
    partitions=[
        ds.field("date") == pa.scalar(pd.Timestamp("2018-01-01"), pa.timestamp("ns")),
        ds.field("date") == pa.scalar(pd.Timestamp("2018-01-02"), pa.timestamp("ns")),
    ],
)

print(dataset.to_table().to_pandas())
print(dataset.to_table(filter=jan_first_filter).to_pandas())