Details

    Description

      The pandas UDF parity tests on branch-3.5 fail under Spark Connect: the error messages surfaced from the Python worker no longer match the patterns the tests assert with assertRaisesRegex.

      ======================================================================
      FAIL [1.071s]: test_pandas_udf_arrow_overflow (pyspark.sql.tests.connect.test_parity_pandas_udf.PandasUDFParityTests.test_pandas_udf_arrow_overflow)
      ----------------------------------------------------------------------
      pyspark.errors.exceptions.connect.PythonException: 
        An exception was thrown from the Python worker. Please see the stack trace below.
      Traceback (most recent call last):
        File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 302, in _create_array
          return pa.Array.from_pandas(
                 ^^^^^^^^^^^^^^^^^^^^^
        File "pyarrow/array.pxi", line 1054, in pyarrow.lib.Array.from_pandas
        File "pyarrow/array.pxi", line 323, in pyarrow.lib.array
        File "pyarrow/array.pxi", line 83, in pyarrow.lib._ndarray_to_array
        File "pyarrow/error.pxi", line 100, in pyarrow.lib.check_status
      pyarrow.lib.ArrowInvalid: Integer value 128 not in range: -128 to 127
      
      The above exception was the direct cause of the following exception:
      
      Traceback (most recent call last):
        File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1834, in main
          process()
        File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1826, in process
          serializer.dump_stream(out_iter, outfile)
        File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 531, in dump_stream
          return ArrowStreamSerializer.dump_stream(self, init_stream_yield_batches(), stream)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
        File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 104, in dump_stream
          for batch in iterator:
        File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 525, in init_stream_yield_batches
          batch = self._create_batch(series)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^
        File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 511, in _create_batch
          arrs.append(self._create_array(s, t, arrow_cast=self._arrow_cast))
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
        File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 330, in _create_array
          raise PySparkValueError(error_msg % (series.dtype, series.na...
      
      During handling of the above exception, another exception occurred:
      
      Traceback (most recent call last):
        File "/home/runner/work/spark/spark-3.5/python/pyspark/sql/tests/pandas/test_pandas_udf.py", line 299, in test_pandas_udf_arrow_overflow
          with self.assertRaisesRegex(
      AssertionError: "Exception thrown when converting pandas.Series" does not match "
        An exception was thrown from the Python worker. Please see the stack trace below.
      Traceback (most recent call last):
        File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 302, in _create_array
          return pa.Array.from_pandas(
                 ^^^^^^^^^^^^^^^^^^^^^
        File "pyarrow/array.pxi", line 1054, in pyarrow.lib.Array.from_pandas
        File "pyarrow/array.pxi", line 323, in pyarrow.lib.array
        File "pyarrow/array.pxi", line 83, in pyarrow.lib._ndarray_to_array
        File "pyarrow/error.pxi", line 100, in pyarrow.lib.check_status
      pyarrow.lib.ArrowInvalid: Integer value 128 not in range: -128 to 127
      
      The above exception was the direct cause of the following exception:
      
      Traceback (most recent call last):
        File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1834, in main
          process()
        File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1826, in process
          serializer.dump_stream(out_iter, outfile)
        File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 531, in dump_stream
      
      Traceback (most recent call last):
        File "/home/runner/work/spark/spark-3.5/python/pyspark/sql/tests/pandas/test_pandas_udf.py", line 279, in test_pandas_udf_detect_unsafe_type_conversion
          with self.assertRaisesRegex(
      AssertionError: "Exception thrown when converting pandas.Series" does not match "
        An exception was thrown from the Python worker. Please see the stack trace below.
      Traceback (most recent call last):
        File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line [302](https://github.com/HyukjinKwon/spark/actions/runs/8916220872/job/24487232590#step:9:303), in _create_array
          return pa.Array.from_pandas(
                 ^^^^^^^^^^^^^^^^^^^^^
        File "pyarrow/array.pxi", line 1054, in pyarrow.lib.Array.from_pandas
        File "pyarrow/array.pxi", line 323, in pyarrow.lib.array
        File "pyarrow/array.pxi", line 83, in pyarrow.lib._ndarray_to_array
        File "pyarrow/error.pxi", line 100, in pyarrow.lib.check_status
      pyarrow.lib.ArrowInvalid: Float value 0.5 was truncated converting to int32
      
      The above exception was the direct cause of the following exception:
      
      Traceback (most recent call last):
        File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1834, in main
          process()
        File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1826, in process
          serializer.dump_stream(out_iter, outfile)
        File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 531, in dump_stream
          return ArrowStreamSerializer.dump_stream(self, init_stream_yield_batches(), stream)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
        File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 104, in dump_stream
          for batch in iterator:
        File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 525, in init_stream_yield_batches
          batch = self._create_batch(series)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^
        File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 511, in _create_batch
          arrs.append(self._create_array(s, t, arrow_cast=self._arrow_cast))
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
        File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 330, in _create_array
          raise PySparkValueError(error_msg % (series.dtype, ser..."
      
      ----------------------------------------------------------------------
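
      For reference, the two ArrowInvalid errors in the traces above can be reproduced with PyArrow alone (a minimal sketch, independent of Spark; the exact messages may vary across PyArrow versions):

        import pandas as pd
        import pyarrow as pa

        # Overflow: 128 does not fit in int8 (-128 to 127), as in
        # test_pandas_udf_arrow_overflow.
        try:
            pa.Array.from_pandas(pd.Series([128]), type=pa.int8(), safe=True)
        except pa.ArrowInvalid as e:
            print(e)  # Integer value 128 not in range: -128 to 127

        # Unsafe cast: 0.5 cannot be converted to int32 without truncation,
        # as in test_pandas_udf_detect_unsafe_type_conversion.
        try:
            pa.Array.from_pandas(pd.Series([0.5]), type=pa.int32(), safe=True)
        except pa.ArrowInvalid as e:
            print(e)  # Float value 0.5 was truncated converting to int32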
      
      
      ======================================================================
      FAIL [0.162s]: test_vectorized_udf_exception (pyspark.sql.tests.connect.test_parity_pandas_udf_scalar.PandasUDFScalarParityTests.test_vectorized_udf_exception)
      ----------------------------------------------------------------------
      pyspark.errors.exceptions.connect.PythonException: 
        An exception was thrown from the Python worker. Please see the stack trace below.
      Traceback (most recent call last):
        File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1834, in main
          process()
        File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1826, in process
          serializer.dump_stream(out_iter, outfile)
        File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 531, in dump_stream
          return ArrowStreamSerializer.dump_stream(self, init_stream_yield_batches(), stream)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
        File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 104, in dump_stream
          for batch in iterator:
        File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 524, in init_stream_yield_batches
          for series in iterator:
        File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1734, in mapper
          result = tuple(f(*[a[o] for o in arg_offsets]) for arg_offsets, f in udfs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
        File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1734, in <genexpr>
          result = tuple(f(*[a[o] for o in arg_offsets]) for arg_offsets, f in udfs)
                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
        File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line 146, in <lambda>
          verify_result_length(verify_result_type(func(*a)), len(a[0])),
                                                  ^^^^^^^^
        File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/util.py", line 118, in wrapper
          return f(*args, **kwargs)
                 ^^^^^^^^^^^^^^^^^^
        File "/home/runner/work/spark/spark-3.5/python/pyspark/sql/tests/pandas/test_pandas_udf_scalar.py", line 650, in <lambda>
          scalar_raise_exception = pandas_udf(lambda x: x * (1 / 0), LongType())
                                                             ~~^~...
      During handling of the above exception, another exception occurred:
      Traceback (most recent call last):
        File "/home/runner/work/spark/spark-3.5/python/pyspark/sql/tests/connect/test_parity_pandas_udf_scalar.py", line 35, in test_vectorized_udf_exception
          self.check_vectorized_udf_exception()
        File "/home/runner/work/spark/spark-3.5/python/pyspark/sql/tests/pandas/test_pandas_udf_scalar.py", line 658, in check_vectorized_udf_exception
          with self.assertRaisesRegex(Exception, "division( or modulo)? by zero"):
      AssertionError: "division( or modulo)? by zero" does not match "
        An exception was thrown from the Python worker. Please see the stack trace below.
      Traceback (most recent call last):
        File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1834, in main
          process()
        File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1826, in process
          serializer.dump_stream(out_iter, outfile)
        File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 531, in dump_stream
          return ArrowStreamSerializer.dump_stream(self, init_stream_yield_batches(), stream)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
        File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 104, in dump_stream
          for batch in iterator:
        File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 524, in init_stream_yield_batches
          for series in iterator:
        File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1734, in mapper
          result = tuple(f(*[a[o] for o in arg_offsets]) for arg_offsets, f in udfs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
        File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1734, in <genexpr>
          result = tuple(f(*[a[o] for o in arg_offsets]) for arg_offsets, f in udfs)
                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
        File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line 146, in <lambda>
          verify_result_length(verify_result_type(func(*a)), len(a[0])),
                                                  ^^^^^^^^
        File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/util.py", line 118, in wrapper
          return f(*args, **kwargs)
                 ^^^^^^^^^^^^^^^^^^
        File "/home/runner/work/spark/spark-3.5/python/pyspark/sql/tests/pandas/test_pandas_udf_scalar.py", line 650, in <lambda>
          scalar_raise_exception = pandas_udf(lambda x: x * (1 / 0), LongType())
                                                             ~~^~..."
      ----------------------------------------------------------------------
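
      All of these assertions fail the same way: assertRaisesRegex applies re.search to the exception's string form, so the tests break whenever the message surfaced through Spark Connect stops containing the expected phrase. A minimal sketch with plain unittest (a hypothetical test class, not part of the Spark suite):

        import unittest

        class MessagePatternExample(unittest.TestCase):
            # Hypothetical stand-in for the failing parity tests:
            # assertRaisesRegex searches str(exception), so a test fails
            # whenever the surfaced message no longer contains the pattern.
            def test_pattern_found(self):
                with self.assertRaisesRegex(
                    ZeroDivisionError, "division( or modulo)? by zero"
                ):
                    1 / 0  # raises ZeroDivisionError("division by zero")

        if __name__ == "__main__":
            unittest.main()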
      

              People

                Assignee: Hyukjin Kwon (gurwls223)
                Reporter: Hyukjin Kwon (gurwls223)
