Uploaded image for project: 'Apache Arrow'
  1. Apache Arrow
  2. ARROW-1863

[Python] PyObjectStringify could render bytes-like output for more types of objects

Details

    Description

      PyObjectStringify doesn't correctly handle objects that are not strings (i.e. neither bytes nor unicode). It should use PyObject_Repr (or PyObject_Str) to get a string representation of the PyObject.

      // Current implementation: exposes a (bytes, size) view of a Python string object.
      // Only unicode and bytes objects are supported; for anything else the
      // view is left as (NULLPTR, -1).
      struct ARROW_EXPORT PyObjectStringify {
        // Receives the temporary UTF-8 bytes object created for unicode input.
        OwnedRef tmp_obj;
        // Pointer returned by PyBytes_AsString, or NULLPTR for unsupported objects.
        const char* bytes;
        // Length from PyBytes_GET_SIZE, or -1 for unsupported objects.
        Py_ssize_t size;
      
        explicit PyObjectStringify(PyObject* obj) {
          PyObject* bytes_obj;
          if (PyUnicode_Check(obj)) {
            // Unicode: encode to UTF-8 and hand the new bytes object to tmp_obj.
            // NOTE(review): the result of PyUnicode_AsUTF8String is not checked for NULL.
            bytes_obj = PyUnicode_AsUTF8String(obj);
            tmp_obj.reset(bytes_obj);
            bytes = PyBytes_AsString(bytes_obj);
            size = PyBytes_GET_SIZE(bytes_obj);
          } else if (PyBytes_Check(obj)) {
            // Bytes: read the buffer directly from obj; tmp_obj is not touched.
            bytes = PyBytes_AsString(obj);
            size = PyBytes_GET_SIZE(obj);
          } else {
            // Any other type (e.g. a dict): no string view is produced.
            // NOTE(review): per this report, this is the path hit by the
            // segfaulting reproduction case -- presumably callers use
            // bytes/size without checking for NULLPTR / -1.
            bytes = NULLPTR;
            size = -1;
          }
        }
      };
      

      It should be changed to:

      // Exposes a (bytes, size) view of a Python object's string form.
      //
      //  - unicode: encoded to UTF-8; the encoded bytes object is handed to tmp_obj.
      //  - bytes:   the buffer is read directly from `obj` (no copy), so `obj`
      //             must outlive this struct.
      //  - other:   repr(obj) is used. On Python 3 repr() returns a unicode
      //             object, so it is UTF-8 encoded before the bytes accessors
      //             are applied; on Python 2 it is already a bytes string.
      //
      // If repr() or the UTF-8 encoding fails, the pending Python error is
      // cleared (a constructor has no way to report it) and the view is left
      // as (NULLPTR, -1); callers must check for that before using bytes/size.
      struct ARROW_EXPORT PyObjectStringify {
        OwnedRef tmp_obj;
        const char* bytes;
        Py_ssize_t size;
      
        explicit PyObjectStringify(PyObject* obj) : bytes(NULLPTR), size(-1) {
          PyObject* bytes_obj = NULLPTR;
          if (PyUnicode_Check(obj)) {
            bytes_obj = PyUnicode_AsUTF8String(obj);
          } else if (PyBytes_Check(obj)) {
            // Borrow the buffer straight from the caller's object.
            bytes = PyBytes_AsString(obj);
            size = PyBytes_GET_SIZE(obj);
            return;
          } else {
            PyObject* repr_obj = PyObject_Repr(obj);
            if (repr_obj != NULLPTR) {
              if (PyBytes_Check(repr_obj)) {
                // Python 2: repr() already produced a bytes string; keep it.
                bytes_obj = repr_obj;
              } else {
                // Python 3: repr() produced unicode; PyBytes_* must not be
                // applied to it directly, so encode it first.
                bytes_obj = PyUnicode_AsUTF8String(repr_obj);
                Py_DECREF(repr_obj);
              }
            }
          }

          if (bytes_obj == NULLPTR) {
            // repr() raised or the text could not be encoded.
            PyErr_Clear();
            return;
          }

          tmp_obj.reset(bytes_obj);
          bytes = PyBytes_AsString(bytes_obj);
          size = PyBytes_GET_SIZE(bytes_obj);
        }
      };
      

      How does this affect pyarrow? Minimal reproduction case:

      import pyarrow
      
      # All elements are str except the dict, which is neither str nor bytes.
      data = ['-10', '-5', {'a': 1}, '0', '5', '10']
      
      # Building a string-typed array from the mixed list is the call that
      # crashes the interpreter (see the segmentation fault output below).
      arr = pyarrow.array(data, type=pyarrow.string())
      
      [1]    64491 segmentation fault  ipython
      

      This case was found by my colleague. I will ask him to send a PR here.

      cc [~wesmckinn]

      Attachments

        Issue Links

          Activity

            People

              cpcloud Phillip Cloud
              advancedxy YE
              Votes:
              0 Vote for this issue
              Watchers:
              6 Start watching this issue

              Dates

                Created:
                Updated:
                Resolved:

                Slack

                  Issue deployment