Details
Description
A Bucketizer created for multiple columns (i.e. with the splitsArray, inputCols and outputCols parameters) cannot be loaded after saving it.
The problem does not occur for a Bucketizer created for a single column.
Code to reproduce
###################################
# Reproduction: save/load round-trip of a multi-column Bucketizer.
# Loading fails with:
#   TypeError: array() argument 1 must be a unicode character, not bytes
# NOTE(review): `spark` is the ambient SparkSession provided by the
# notebook/shell environment.
from pyspark.ml.feature import Bucketizer

df = spark.createDataFrame([(0.1,), (0.4,), (1.2,), (1.5,)], ["values"])
bucketizer = Bucketizer(
    splitsArray=[
        [-float("inf"), 0.5, 1.4, float("inf")],
        [-float("inf"), 0.1, 1.2, float("inf")],
    ],
    inputCols=["values", "values"],
    outputCols=["b1", "b2"],
)
dfb = bucketizer.transform(df)
# show() already prints the DataFrame and returns None; wrapping it in
# print() only adds a stray "None" line.
dfb.show()
bucketizerPath = "dbfs:/mnt/S3-Bucket/" + "Bucketizer"
bucketizer.write().overwrite().save(bucketizerPath)
loadedBucketizer = Bucketizer.load(bucketizerPath)  #### Failing here
# Multi-column mode sets only the splitsArray param, never splits, so
# getSplits() would itself raise on a correctly loaded model. Compare the
# multi-column accessor instead.
loadedBucketizer.getSplitsArray() == bucketizer.getSplitsArray()
############################################################
The error message is
TypeError: array() argument 1 must be a unicode character, not bytes
BackTrace:
--------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<command-3999490> in <module>
     15
     16 bucketizer.write().overwrite().save(bucketizerPath)
---> 17 loadedBucketizer = Bucketizer.load(bucketizerPath)
     18 loadedBucketizer.getSplits() == bucketizer.getSplits()
/databricks/spark/python/pyspark/ml/util.py in load(cls, path)
376 def load(cls, path):
377 """Reads an ML instance from the input path, a shortcut of `read().load(path)`."""
--> 378 return cls.read().load(path)
379
380
/databricks/spark/python/pyspark/ml/util.py in load(self, path)
330 raise NotImplementedError("This Java ML type cannot be loaded into Python currently: %r"
331 % self._clazz)
--> 332 return self._clazz._from_java(java_obj)
333
334
def session(self, sparkSession): /databricks/spark/python/pyspark/ml/wrapper.py in _from_java(java_stage)
258
259 py_stage._resetUid(java_stage.uid())
--> 260 py_stage._transfer_params_from_java()
261 elif hasattr(py_type, "_from_java"):
262 py_stage = py_type._from_java(java_stage)
/databricks/spark/python/pyspark/ml/wrapper.py in _transfer_params_from_java(self)
186 # SPARK-14931: Only check set params back to avoid default params mismatch.
187 if self._java_obj.isSet(java_param): -->
188 value = _java2py(sc, self._java_obj.getOrDefault(java_param))
189 self._set(**
)
190 # SPARK-10931: Temporary fix for params that have a default in Java
/databricks/spark/python/pyspark/ml/common.py in _java2py(sc, r, encoding)
107
108 if isinstance(r, (bytearray, bytes)):
--> 109 r = PickleSerializer().loads(bytes(r), encoding=encoding)
110 return r
111
/databricks/spark/python/pyspark/serializers.py in loads(self, obj, encoding)
467
468 def loads(self, obj, encoding="bytes"):
--> 469 return pickle.loads(obj, encoding=encoding)
470
471
TypeError: array() argument 1 must be a unicode character, not bytes