Export: XML | Word | Printable | JSON

Details

    • Type: Sub-task
    • Status: Resolved
    • Priority: Major
    • Resolution: Fixed
    • Affects Version/s: 3.4.0
    • Fix Version/s: 3.4.0
    • Component/s: Connect
    • Labels: None

    Description

      import numpy as np
      import pandas as pd
      
      df = self.spark.createDataFrame(
          [[[("a", 2, 3.0), ("a", 2, 3.0)]], [[("b", 5, 6.0), ("b", 5, 6.0)]]],
          "array_struct_col Array<struct<col1:string, col2:long, col3:double>>",
      )
      for is_arrow_enabled in [True, False]:
          with self.sql_conf({"spark.sql.execution.arrow.pyspark.enabled": is_arrow_enabled}):
              pdf = df.toPandas()
              self.assertEqual(type(pdf), pd.DataFrame)
              self.assertEqual(type(pdf["array_struct_col"]), pd.Series)
              if is_arrow_enabled:
                  self.assertEqual(type(pdf["array_struct_col"][0]), np.ndarray)
              else:
                  self.assertEqual(type(pdf["array_struct_col"][0]), list)
      Traceback (most recent call last):
      1415  File "/__w/spark/spark/python/pyspark/sql/tests/test_dataframe.py", line 1202, in test_to_pandas_for_array_of_struct
      1416    df = self.spark.createDataFrame(
      1417  File "/__w/spark/spark/python/pyspark/sql/connect/session.py", line 264, in createDataFrame
      1418    table = pa.Table.from_pylist([dict(zip(_cols, list(item))) for item in _data])
      1419  File "pyarrow/table.pxi", line 3700, in pyarrow.lib.Table.from_pylist
      1420  File "pyarrow/table.pxi", line 5221, in pyarrow.lib._from_pylist
      1421  File "pyarrow/table.pxi", line 3575, in pyarrow.lib.Table.from_arrays
      1422  File "pyarrow/table.pxi", line 1383, in pyarrow.lib._sanitize_arrays
      1423  File "pyarrow/table.pxi", line 1364, in pyarrow.lib._schema_from_arrays
      1424  File "pyarrow/array.pxi", line 320, in pyarrow.lib.array
      1425  File "pyarrow/array.pxi", line 39, in pyarrow.lib._sequence_to_array
      1426  File "pyarrow/error.pxi", line 144, in pyarrow.lib.pyarrow_internal_check_status
      1427  File "pyarrow/error.pxi", line 123, in pyarrow.lib.check_status
      1428  pyarrow.lib.ArrowTypeError: Expected bytes, got a 'int' object

       

      import numpy as np
      
      pdf = self._to_pandas()
      types = pdf.dtypes
      self.assertEqual(types[0], np.int32)
      self.assertEqual(types[1], np.object)
      self.assertEqual(types[2], np.bool)
      self.assertEqual(types[3], np.float32)
      self.assertEqual(types[4], np.object)  # datetime.date
      self.assertEqual(types[5], "datetime64[ns]")
      self.assertEqual(types[6], "datetime64[ns]")
      self.assertEqual(types[7], "timedelta64[ns]") 
      Traceback (most recent call last):
      1434  File "/__w/spark/spark/python/pyspark/sql/tests/test_dataframe.py", line 1039, in test_to_pandas
      1435    self.assertEqual(types[5], "datetime64[ns]")
      1436  AssertionError: datetime64[ns, Etc/UTC] != 'datetime64[ns]'
      1437
      

      Attachments

        Issue Links

          Activity

            People

              gurwls223 Hyukjin Kwon
              techaddict Sandeep Singh
              Votes:
              0 Vote for this issue
              Watchers:
              3 Start watching this issue

              Dates

                Created:
                Updated:
                Resolved: