Details
- Type: Bug
- Status: Resolved
- Priority: Critical
- Resolution: Fixed
- Affects Version: 0.13.0
- Environment: osx python36/conda, cloudpickle 0.8.1,
  arrow-cpp 0.13.0 py36ha71616b_0 conda-forge,
  pyarrow 0.13.0 py36hb37e6aa_0 conda-forge
Description
Since 0.13.0, ParquetDataset instances are no longer serialisable, which means that dask.distributed cannot pass them between processes in order to load parquet files in parallel.
Example:
```
>>> import cloudpickle
>>> import pyarrow.parquet as pq
>>> pf = pq.ParquetDataset('nation.impala.parquet')
>>> cloudpickle.dumps(pf)
~/anaconda/envs/py36/lib/python3.6/site-packages/cloudpickle/cloudpickle.py in dumps(obj, protocol)
893 try:
894 cp = CloudPickler(file, protocol=protocol)
--> 895 cp.dump(obj)
896 return file.getvalue()
897 finally:
~/anaconda/envs/py36/lib/python3.6/site-packages/cloudpickle/cloudpickle.py in dump(self, obj)
266 self.inject_addons()
267 try:
--> 268 return Pickler.dump(self, obj)
269 except RuntimeError as e:
270 if 'recursion' in e.args[0]:
~/anaconda/envs/py36/lib/python3.6/pickle.py in dump(self, obj)
407 if self.proto >= 4:
408 self.framer.start_framing()
--> 409 self.save(obj)
410 self.write(STOP)
411 self.framer.end_framing()
~/anaconda/envs/py36/lib/python3.6/pickle.py in save(self, obj, save_persistent_id)
519
520 # Save the reduce() output and finally memoize the object
--> 521 self.save_reduce(obj=obj, *rv)
522
523 def persistent_id(self, obj):
~/anaconda/envs/py36/lib/python3.6/pickle.py in save_reduce(self, func, args, state, listitems, dictitems, obj)
632
633 if state is not None:
--> 634 save(state)
635 write(BUILD)
636
~/anaconda/envs/py36/lib/python3.6/pickle.py in save(self, obj, save_persistent_id)
474 f = self.dispatch.get(t)
475 if f is not None:
--> 476 f(self, obj) # Call unbound method with explicit self
477 return
478
~/anaconda/envs/py36/lib/python3.6/pickle.py in save_dict(self, obj)
819
820 self.memoize(obj)
--> 821 self._batch_setitems(obj.items())
822
823 dispatch[dict] = save_dict
~/anaconda/envs/py36/lib/python3.6/pickle.py in _batch_setitems(self, items)
845 for k, v in tmp:
846 save(k)
--> 847 save(v)
848 write(SETITEMS)
849 elif n:
~/anaconda/envs/py36/lib/python3.6/pickle.py in save(self, obj, save_persistent_id)
494 reduce = getattr(obj, "__reduce_ex__", None)
495 if reduce is not None:
--> 496 rv = reduce(self.proto)
497 else:
498 reduce = getattr(obj, "__reduce__", None)
~/anaconda/envs/py36/lib/python3.6/site-packages/pyarrow/_parquet.cpython-36m-darwin.so in pyarrow._parquet.ParquetSchema.__reduce_cython__()
TypeError: no default __reduce__ due to non-trivial __cinit__
```
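The final TypeError is Cython's generic message for an extension type that defines `__cinit__` but no pickling support: pickle cannot introspect C-level state, so the class has to supply a `__reduce__` that returns a reconstruction recipe. A minimal pure-Python sketch of that pattern (illustrative names only; a plain Python class would pickle anyway via its `__dict__`, the explicit hook is what a Cython class like ParquetSchema would need to implement):
```
import pickle

class Schema:
    """Stand-in for an extension type whose state pickle cannot see."""
    def __init__(self, fields):
        self.fields = fields  # stand-in for C-level state

    def __reduce__(self):
        # pickle stores (callable, args) and calls Schema(self.fields)
        # again when the bytes are loaded
        return (Schema, (self.fields,))

s = Schema(['n_nationkey', 'n_name'])
assert pickle.loads(pickle.dumps(s)).fields == s.fields
```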
The ParquetSchema instance indicated in the traceback is also referenced by the ParquetDatasetPiece objects.
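Until pickling support lands, one possible workaround for dask-style fan-out (a sketch, not from the original report) is to serialise the dataset's constructor arguments rather than the dataset itself, and rebuild the ParquetDataset in each worker process:
```
import cloudpickle
import pyarrow.parquet as pq

def open_dataset(path):
    # Rebuild the dataset in the receiving process; only the path
    # string crosses the process boundary.
    return pq.ParquetDataset(path)

# A function plus plain arguments pickles fine even though the
# resulting ParquetDataset object does not.
payload = cloudpickle.dumps((open_dataset, 'nation.impala.parquet'))

fn, path = cloudpickle.loads(payload)
pf = fn(path)
```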