  1. Spark
  2. SPARK-35320

from_json cannot parse maps with timestamp as key



    • 3.0.1, 3.1.1
    • 3.3.0
    • SQL
      • Java 11
      • Spark 3.0.1/3.1.1
      • Scala 2.12


      I have a JSON document that contains a {{map<timestamp,string>}} like the following:

        "map": {
          "2021-05-05T20:05:08": "sampleValue"
        }

      The key of the map is a string containing a formatted timestamp and I want to parse it as a Java {{Map<Instant,String>}} using the {{from_json}} Spark SQL function (see the Sample class in the code below).

      import org.apache.spark.sql.Dataset;
      import org.apache.spark.sql.Encoders;
      import org.apache.spark.sql.Row;
      import org.apache.spark.sql.SparkSession;
      import java.io.Serializable;
      import java.time.Instant;
      import java.util.List;
      import java.util.Map;
      import static org.apache.spark.sql.functions.*;
      /**
       * Minimal reproduction that feeds two JSON documents through {@code from_json}:
       * one whose map holds the timestamp as the value, and one whose map holds the
       * timestamp as the key. The schemas are derived from the nested bean classes.
       *
       * NOTE(review): this excerpt looks truncated -- closing braces and some statements
       * (for example the remainder of the SparkSession builder chain and the calls that
       * run the two tests) are not visible here; confirm against the original snippet.
       */
      public class TimestampAsJsonMapKey {
          /** Bean with a single {@code map} property keyed by {@link Instant}; its schema is used by notWorkingTest. */
          public static class Sample implements Serializable {
              private Map<Instant, String> map;
              /** Returns the map keyed by timestamp. */
              public Map<Instant, String> getMap() {
                  return map;
              /** Replaces the map keyed by timestamp. */
              public void setMap(Map<Instant, String> map) {
                  this.map = map;
          /** Bean with a single {@code map} property whose values are {@link Instant}; its schema is used by workingTest. */
          public static class InvertedSample implements Serializable {
              private Map<String, Instant> map;
              /** Returns the map whose values are timestamps. */
              public Map<String, Instant> getMap() {
                  return map;
              /** Replaces the map whose values are timestamps. */
              public void setMap(Map<String, Instant> map) {
                  this.map = map;
          /** Entry point; starts configuring the SparkSession with the application name below. */
          public static void main(String[] args) {
              final SparkSession spark = SparkSession
                      // NOTE(review): the calls that normally surround appName (builder/getOrCreate)
                      // are not visible in this excerpt.
                      .appName("Timestamp As Json Map Key Test")
          /**
           * Parses a JSON document whose map has a string key and a timestamp value,
           * using the schema derived from {@link InvertedSample}.
           */
          private static void workingTest(SparkSession spark) {
              final String invertedSampleJson = "{ \"map\": { \"sampleValue\": \"2021-05-05T20:05:08\" } }";
              // Single-row dataset of raw JSON strings.
              final Dataset<String> samplesDf = spark.createDataset(List.of(invertedSampleJson), Encoders.STRING());
              // "value" is the column selected from the string dataset; the schema comes from the bean encoder.
              final Dataset<Row> parsedDf = samplesDf.select(from_json(col("value"), Encoders.bean(InvertedSample.class).schema()));
          /**
           * Parses a JSON document whose map has a timestamp key and a string value,
           * using the schema derived from {@link Sample}.
           *
           * NOTE(review): the statement that triggers evaluation of parsedDf is not visible
           * in this excerpt.
           */
          private static void notWorkingTest(SparkSession spark) {
              final String sampleJson = "{ \"map\": { \"2021-05-05T20:05:08\": \"sampleValue\" } }";
              // Single-row dataset of raw JSON strings.
              final Dataset<String> samplesDf = spark.createDataset(List.of(sampleJson), Encoders.STRING());
              // Same pipeline as workingTest, but the schema's map key type is the timestamp.
              final Dataset<Row> parsedDf = samplesDf.select(from_json(col("value"), Encoders.bean(Sample.class).schema()));

      When I run the notWorkingTest method, it fails with the following exception:

      Exception in thread "main" java.lang.ClassCastException: class org.apache.spark.unsafe.types.UTF8String cannot be cast to class java.lang.Long (org.apache.spark.unsafe.types.UTF8String is in unnamed module of loader 'app'; java.lang.Long is in module java.base of loader 'bootstrap')
      	at scala.runtime.BoxesRunTime.unboxToLong(BoxesRunTime.java:107)
      	at org.apache.spark.sql.catalyst.expressions.CastBase.$anonfun$castToString$8$adapted(Cast.scala:297)
      	at org.apache.spark.sql.catalyst.expressions.CastBase.buildCast(Cast.scala:285)
      	at org.apache.spark.sql.catalyst.expressions.CastBase.$anonfun$castToString$7(Cast.scala:297)
      	at org.apache.spark.sql.catalyst.expressions.CastBase.$anonfun$castToString$12(Cast.scala:329)
      	at org.apache.spark.sql.catalyst.expressions.CastBase.buildCast(Cast.scala:285)
      	at org.apache.spark.sql.catalyst.expressions.CastBase.$anonfun$castToString$11(Cast.scala:321)
      	at org.apache.spark.sql.catalyst.expressions.CastBase.$anonfun$castToString$14(Cast.scala:359)
      	at org.apache.spark.sql.catalyst.expressions.CastBase.buildCast(Cast.scala:285)
      	at org.apache.spark.sql.catalyst.expressions.CastBase.$anonfun$castToString$13(Cast.scala:352)
      	at org.apache.spark.sql.catalyst.expressions.CastBase.nullSafeEval(Cast.scala:815)
      	at org.apache.spark.sql.catalyst.expressions.UnaryExpression.eval(Expression.scala:461)
      	at org.apache.spark.sql.catalyst.expressions.Alias.eval(namedExpressions.scala:156)
      	at org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(InterpretedMutableProjection.scala:83)
      	at org.apache.spark.sql.catalyst.optimizer.ConvertToLocalRelation$$anonfun$apply$17.$anonfun$applyOrElse$71(Optimizer.scala:1508)

      It seems that if a timestamp is the key in a map, it must necessarily be of type long and cannot be of type string.


       In the workingTest method, by contrast, the map is inverted (the timestamp appears as the value rather than as the key), and it is parsed correctly.




