Details
-
Bug
-
Status: Resolved
-
Major
-
Resolution: Not A Problem
-
3.0.1
-
None
-
None
Description
I'm trying to use to_date on a string formatted as "10/31/20".
Expected output is "2020-10-31".
Actual output is "0020-01-31".
The documentation suggests that the "y" pattern letter accepts either "2020" or "20" as input.
A reproduction is given below; the expected behaviour is demonstrated by the included UDF.
import java.sql.Date
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{to_date, udf}

/** Reproduces parsing of two-digit-year dates such as "10/31/20".
  *
  * The original report used the pattern "m/d/y", which is incorrect under
  * Spark 3's datetime patterns: lowercase 'm' is minute-of-hour (not month),
  * and a bare 'y' parses "20" as the literal year 20 — hence the reported
  * output "0020-01-31". The correct pattern is "M/d/yy": 'M' is
  * month-of-year, and 'yy' resolves a two-digit year against base 2000,
  * so "10/31/20" parses to 2020-10-31 as expected.
  */
object ToDate {

  // Hand-rolled equivalent of the intended parse: splits "M/d/yy", zero-pads
  // month and day, and adds 2000 to the two-digit year. Note this assumes the
  // input always has exactly three "/"-separated numeric fields.
  val toDate = udf((date: String) => {
    val split = date.split("/")
    val month = "%02d".format(split(0).toInt)
    val day = "%02d".format(split(1).toInt)
    val year = split(2).toInt + 2000
    Date.valueOf(s"${year}-${month}-${day}")
  })

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").getOrCreate()
    spark.sparkContext.setLogLevel("ERROR")
    import spark.implicits._

    Seq("1/1/20", "10/31/20")
      .toDF("raw")
      // "M/d/yy" is the correct pattern; the original "m/d/y" mis-parses
      // because 'm' means minute-of-hour and 'y' takes "20" as year 20.
      .withColumn("to_date", to_date($"raw", "M/d/yy"))
      .withColumn("udf", toDate($"raw"))
      .show()
  }
}