Description
import numpy as np
import pandas as pd

df = self.spark.createDataFrame(
    [[[("a", 2, 3.0), ("a", 2, 3.0)]], [[("b", 5, 6.0), ("b", 5, 6.0)]]],
    "array_struct_col Array<struct<col1:string, col2:long, col3:double>>",
)
for is_arrow_enabled in [True, False]:
    with self.sql_conf({"spark.sql.execution.arrow.pyspark.enabled": is_arrow_enabled}):
        pdf = df.toPandas()
        self.assertEqual(type(pdf), pd.DataFrame)
        self.assertEqual(type(pdf["array_struct_col"]), pd.Series)
        if is_arrow_enabled:
            self.assertEqual(type(pdf["array_struct_col"][0]), np.ndarray)
        else:
            self.assertEqual(type(pdf["array_struct_col"][0]), list)
Traceback (most recent call last):
  File "/__w/spark/spark/python/pyspark/sql/tests/test_dataframe.py", line 1202, in test_to_pandas_for_array_of_struct
    df = self.spark.createDataFrame(
  File "/__w/spark/spark/python/pyspark/sql/connect/session.py", line 264, in createDataFrame
    table = pa.Table.from_pylist([dict(zip(_cols, list(item))) for item in _data])
  File "pyarrow/table.pxi", line 3700, in pyarrow.lib.Table.from_pylist
  File "pyarrow/table.pxi", line 5221, in pyarrow.lib._from_pylist
  File "pyarrow/table.pxi", line 3575, in pyarrow.lib.Table.from_arrays
  File "pyarrow/table.pxi", line 1383, in pyarrow.lib._sanitize_arrays
  File "pyarrow/table.pxi", line 1364, in pyarrow.lib._schema_from_arrays
  File "pyarrow/array.pxi", line 320, in pyarrow.lib.array
  File "pyarrow/array.pxi", line 39, in pyarrow.lib._sequence_to_array
  File "pyarrow/error.pxi", line 144, in pyarrow.lib.pyarrow_internal_check_status
  File "pyarrow/error.pxi", line 123, in pyarrow.lib.check_status
pyarrow.lib.ArrowTypeError: Expected bytes, got a 'int' object
import numpy as np

pdf = self._to_pandas()
types = pdf.dtypes
self.assertEqual(types[0], np.int32)
self.assertEqual(types[1], np.object)
self.assertEqual(types[2], np.bool)
self.assertEqual(types[3], np.float32)
self.assertEqual(types[4], np.object)  # datetime.date
self.assertEqual(types[5], "datetime64[ns]")
self.assertEqual(types[6], "datetime64[ns]")
self.assertEqual(types[7], "timedelta64[ns]")
Traceback (most recent call last):
  File "/__w/spark/spark/python/pyspark/sql/tests/test_dataframe.py", line 1039, in test_to_pandas
    self.assertEqual(types[5], "datetime64[ns]")
AssertionError: datetime64[ns, Etc/UTC] != 'datetime64[ns]'
Attachments
Attachments
Issue Links
- is a clone of
-
SPARK-41876 Implement DataFrame `toLocalIterator`
- Resolved
- links to