Uploaded image for project: 'Spark'
  1. Spark
  2. SPARK-35320

from_json cannot parse maps with timestamp as key

    XMLWordPrintableJSON

Details

    • Improvement
    • Status: Resolved
    • Minor
    • Resolution: Fixed
    • 3.0.1, 3.1.1
    • 3.3.0
    • SQL
    • None
      • Java 11
      • Spark 3.0.1/3.1.1
      • Scala 2.12

    Description

      I have a JSON document that contains a map<timestamp,string>, like the following:

      {
        "map": {
          "2021-05-05T20:05:08": "sampleValue"
        }
      }
      

      The key of the map is a string containing a formatted timestamp and I want to parse it as a Java {{Map<Instant,String>}} using the {{from_json}} Spark SQL function (see the Sample class in the code below).

      import org.apache.spark.sql.Dataset;
      import org.apache.spark.sql.Encoders;
      import org.apache.spark.sql.Row;
      import org.apache.spark.sql.SparkSession;
      
      import java.io.Serializable;
      import java.time.Instant;
      import java.util.List;
      import java.util.Map;
      
      import static org.apache.spark.sql.functions.*;
      
      public class TimestampAsJsonMapKey {
      
          public static class Sample implements Serializable {
              private Map<Instant, String> map;
              
              public Map<Instant, String> getMap() {
                  return map;
              }
              
              public void setMap(Map<Instant, String> map) {
                  this.map = map;
              }
          }
      
          public static class InvertedSample implements Serializable {
              private Map<String, Instant> map;
              
              public Map<String, Instant> getMap() {
                  return map;
              }
              
              public void setMap(Map<String, Instant> map) {
                  this.map = map;
              }
          }
      
          public static void main(String[] args) {
      
              final SparkSession spark = SparkSession
                      .builder()
                      .appName("Timestamp As Json Map Key Test")
                      .master("local[1]")
                      .getOrCreate();
      
              workingTest(spark);
      
              notWorkingTest(spark);
      
          }
      
          private static void workingTest(SparkSession spark) {
              //language=JSON
              final String invertedSampleJson = "{ \"map\": { \"sampleValue\": \"2021-05-05T20:05:08\" } }";
      
              final Dataset<String> samplesDf = spark.createDataset(List.of(invertedSampleJson), Encoders.STRING());
      
              final Dataset<Row> parsedDf = samplesDf.select(from_json(col("value"), Encoders.bean(InvertedSample.class).schema()));
      
              parsedDf.show(false);
          }
      
          private static void notWorkingTest(SparkSession spark) {
              //language=JSON
              final String sampleJson = "{ \"map\": { \"2021-05-05T20:05:08\": \"sampleValue\" } }";
      
              final Dataset<String> samplesDf = spark.createDataset(List.of(sampleJson), Encoders.STRING());
      
              final Dataset<Row> parsedDf = samplesDf.select(from_json(col("value"), Encoders.bean(Sample.class).schema()));
      
              parsedDf.show(false);
          }
      }
      

      When I run the notWorkingTest method it fails with the following exception:

      Exception in thread "main" java.lang.ClassCastException: class org.apache.spark.unsafe.types.UTF8String cannot be cast to class java.lang.Long (org.apache.spark.unsafe.types.UTF8String is in unnamed module of loader 'app'; java.lang.Long is in module java.base of loader 'bootstrap')
      	at scala.runtime.BoxesRunTime.unboxToLong(BoxesRunTime.java:107)
      	at org.apache.spark.sql.catalyst.expressions.CastBase.$anonfun$castToString$8$adapted(Cast.scala:297)
      	at org.apache.spark.sql.catalyst.expressions.CastBase.buildCast(Cast.scala:285)
      	at org.apache.spark.sql.catalyst.expressions.CastBase.$anonfun$castToString$7(Cast.scala:297)
      	at org.apache.spark.sql.catalyst.expressions.CastBase.$anonfun$castToString$12(Cast.scala:329)
      	at org.apache.spark.sql.catalyst.expressions.CastBase.buildCast(Cast.scala:285)
      	at org.apache.spark.sql.catalyst.expressions.CastBase.$anonfun$castToString$11(Cast.scala:321)
      	at org.apache.spark.sql.catalyst.expressions.CastBase.$anonfun$castToString$14(Cast.scala:359)
      	at org.apache.spark.sql.catalyst.expressions.CastBase.buildCast(Cast.scala:285)
      	at org.apache.spark.sql.catalyst.expressions.CastBase.$anonfun$castToString$13(Cast.scala:352)
      	at org.apache.spark.sql.catalyst.expressions.CastBase.nullSafeEval(Cast.scala:815)
      	at org.apache.spark.sql.catalyst.expressions.UnaryExpression.eval(Expression.scala:461)
      	at org.apache.spark.sql.catalyst.expressions.Alias.eval(namedExpressions.scala:156)
      	at org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(InterpretedMutableProjection.scala:83)
      	at org.apache.spark.sql.catalyst.optimizer.ConvertToLocalRelation$$anonfun$apply$17.$anonfun$applyOrElse$71(Optimizer.scala:1508)
      

      It seems that if a timestamp is used as the key of a map, it must necessarily be of type long and cannot be of type string.

       


       In the workingTest method, by contrast, I have an inverted map (the timestamp appears as the value in this case, not as the key), and it works correctly.

      Attachments

        Activity

          People

            planga82 Pablo Langa Blanco
            vincenzo.c Vincenzo Cerminara
            Votes:
            0 Vote for this issue
            Watchers:
            5 Start watching this issue

            Dates

              Created:
              Updated:
              Resolved: