Details
-
Bug
-
Status: Resolved
-
Major
-
Resolution: Fixed
-
None
Description
PyObjectStringify does not correctly handle non-string objects (i.e. objects that are neither bytes nor unicode/UTF-8 strings). For such objects it should use PyObject_Repr (or PyObject_Str) to obtain a string representation of the PyObject.
struct ARROW_EXPORT PyObjectStringify { OwnedRef tmp_obj; const char* bytes; Py_ssize_t size; explicit PyObjectStringify(PyObject* obj) { PyObject* bytes_obj; if (PyUnicode_Check(obj)) { bytes_obj = PyUnicode_AsUTF8String(obj); tmp_obj.reset(bytes_obj); bytes = PyBytes_AsString(bytes_obj); size = PyBytes_GET_SIZE(bytes_obj); } else if (PyBytes_Check(obj)) { bytes = PyBytes_AsString(obj); size = PyBytes_GET_SIZE(obj); } else { bytes = NULLPTR; size = -1; } } };
It should be changed to:
struct ARROW_EXPORT PyObjectStringify { OwnedRef tmp_obj; const char* bytes; Py_ssize_t size; explicit PyObjectStringify(PyObject* obj) { PyObject* bytes_obj; if (PyUnicode_Check(obj)) { bytes_obj = PyUnicode_AsUTF8String(obj); tmp_obj.reset(bytes_obj); bytes = PyBytes_AsString(bytes_obj); size = PyBytes_GET_SIZE(bytes_obj); } else if (PyBytes_Check(obj)) { bytes = PyBytes_AsString(obj); size = PyBytes_GET_SIZE(obj); } else { bytes_obj = PyObject_Repr(obj); tmp_obj.reset(bytes_obj); bytes = PyBytes_AsString(bytes_obj); size = PyBytes_GET_SIZE(bytes_obj); } } };
How does this affect pyarrow? Minimal reproduction case:
import pyarrow data = ['-10', '-5', {'a': 1}, '0', '5', '10'] arr = pyarrow.array(data, type=pyarrow.string()) [1] 64491 segmentation fault ipython
This case was found by my colleague. I will ask him to send a pull request here.
cc wesmckinn
Attachments
Issue Links
- links to