Details
-
Bug
-
Status: Resolved
-
Major
-
Resolution: Fixed
-
None
Description
Reproduction script from shadowdsp:
"""Reproduction script: repeatedly read an in-memory Parquet file into pandas.

``read_file`` is decorated with ``memory_profiler.profile`` and is called
three times on the same Parquet payload.
"""
import io

import pandas as pd
import pyarrow as pa

# Kept ahead of the pyarrow.parquet import, exactly as in the original report.
# NOTE(review): per the pyarrow docs, a decay of 0 ms asks jemalloc to hand
# freed pages back to the OS immediately -- confirm for the version in use.
pa.jemalloc_set_decay_ms(0)

import pyarrow.parquet as pq
from memory_profiler import profile


@profile
def read_file(f):
    """Read Parquet data from *f*, convert it to pandas, then drop both objects.

    Args:
        f: Source passed straight to ``pq.read_table`` (``main`` passes an
            ``io.BytesIO`` positioned at offset 0).
    """
    table = pq.read_table(f)
    df = table.to_pandas(strings_to_categorical=True)
    # Explicitly drop the references before the function returns.
    del table
    del df


def main():
    """Build a Parquet payload in memory and read it back three times."""
    rows = 2000000
    df = pd.DataFrame({
        # Despite its name, this column holds dicts of integer lists.
        "string": [{"test": [1, 2], "test1": [3, 4]}] * rows,
        "int": [5] * rows,
        "float": [2.0] * rows,
    })
    table = pa.Table.from_pandas(df, preserve_index=False)

    parquet_stream = io.BytesIO()
    pq.write_table(table, parquet_stream)

    for _ in range(3):
        # Rewind so each read starts from the beginning of the buffer.
        parquet_stream.seek(0)
        read_file(parquet_stream)


if __name__ == '__main__':
    main()
Attachments
Issue Links
- links to