Details
Description
I have a JSON document that contains a map<timestamp,string> like the following:
{ "map": { "2021-05-05T20:05:08": "sampleValue" } }
The key of the map is a string containing a formatted timestamp and I want to parse it as a Java {{Map<Instant,String>}} using the {{from_json}} Spark SQL function (see the Sample class in the code below).
import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.Row; import org.apache.spark.sql.SparkSession; import java.io.Serializable; import java.time.Instant; import java.util.List; import java.util.Map; import static org.apache.spark.sql.functions.*; public class TimestampAsJsonMapKey { public static class Sample implements Serializable { private Map<Instant, String> map; public Map<Instant, String> getMap() { return map; } public void setMap(Map<Instant, String> map) { this.map = map; } } public static class InvertedSample implements Serializable { private Map<String, Instant> map; public Map<String, Instant> getMap() { return map; } public void setMap(Map<String, Instant> map) { this.map = map; } } public static void main(String[] args) { final SparkSession spark = SparkSession .builder() .appName("Timestamp As Json Map Key Test") .master("local[1]") .getOrCreate(); workingTest(spark); notWorkingTest(spark); } private static void workingTest(SparkSession spark) { //language=JSON final String invertedSampleJson = "{ \"map\": { \"sampleValue\": \"2021-05-05T20:05:08\" } }"; final Dataset<String> samplesDf = spark.createDataset(List.of(invertedSampleJson), Encoders.STRING()); final Dataset<Row> parsedDf = samplesDf.select(from_json(col("value"), Encoders.bean(InvertedSample.class).schema())); parsedDf.show(false); } private static void notWorkingTest(SparkSession spark) { //language=JSON final String sampleJson = "{ \"map\": { \"2021-05-05T20:05:08\": \"sampleValue\" } }"; final Dataset<String> samplesDf = spark.createDataset(List.of(sampleJson), Encoders.STRING()); final Dataset<Row> parsedDf = samplesDf.select(from_json(col("value"), Encoders.bean(Sample.class).schema())); parsedDf.show(false); } }
When I run the notWorkingTest method it fails with the following exception:
Exception in thread "main" java.lang.ClassCastException: class org.apache.spark.unsafe.types.UTF8String cannot be cast to class java.lang.Long (org.apache.spark.unsafe.types.UTF8String is in unnamed module of loader 'app'; java.lang.Long is in module java.base of loader 'bootstrap') at scala.runtime.BoxesRunTime.unboxToLong(BoxesRunTime.java:107) at org.apache.spark.sql.catalyst.expressions.CastBase.$anonfun$castToString$8$adapted(Cast.scala:297) at org.apache.spark.sql.catalyst.expressions.CastBase.buildCast(Cast.scala:285) at org.apache.spark.sql.catalyst.expressions.CastBase.$anonfun$castToString$7(Cast.scala:297) at org.apache.spark.sql.catalyst.expressions.CastBase.$anonfun$castToString$12(Cast.scala:329) at org.apache.spark.sql.catalyst.expressions.CastBase.buildCast(Cast.scala:285) at org.apache.spark.sql.catalyst.expressions.CastBase.$anonfun$castToString$11(Cast.scala:321) at org.apache.spark.sql.catalyst.expressions.CastBase.$anonfun$castToString$14(Cast.scala:359) at org.apache.spark.sql.catalyst.expressions.CastBase.buildCast(Cast.scala:285) at org.apache.spark.sql.catalyst.expressions.CastBase.$anonfun$castToString$13(Cast.scala:352) at org.apache.spark.sql.catalyst.expressions.CastBase.nullSafeEval(Cast.scala:815) at org.apache.spark.sql.catalyst.expressions.UnaryExpression.eval(Expression.scala:461) at org.apache.spark.sql.catalyst.expressions.Alias.eval(namedExpressions.scala:156) at org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(InterpretedMutableProjection.scala:83) at org.apache.spark.sql.catalyst.optimizer.ConvertToLocalRelation$$anonfun$apply$17.$anonfun$applyOrElse$71(Optimizer.scala:1508)
It seems that if a timestamp is the key in a map, it must necessarily be of type long, and cannot be of type string.
In the workingTest method, by contrast, I have an inverted map (the timestamp appears as the value rather than the key) and it is parsed correctly.