Uploaded image for project: 'Spark'
  1. Spark
  2. SPARK-29602

How does the spark from_json json and dataframe transform ignore the case of the json key

    XMLWordPrintableJSON

Details

    • Question
    • Status: Resolved
    • Major
    • Resolution: Invalid
    • 2.4.4
    • None
    • Spark Core

    Description

      How does the spark from_json json and dataframe transform ignore the case of the json key

      code 

      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder().master("local[*]").
          enableHiveSupport().getOrCreate()
        //    spark.sqlContext.setConf("spark.sql.caseSensitive", "false")
        import spark.implicits._
        //hive table  data Lower case automatically when saving
        val hivetable =
          """{"deliverysystype":"dms","orderid":"B0001-N103-000-005882-RL3AI2RWCP","storeid":103,"timestamp":1571587522000,"aaaa":"dms"}"""
        val hiveDF = Seq(hivetable).toDF("msg")
        val rdd = hiveDF.rdd.map(_.getString(0))
        val jsonDataDF = spark.read.json(rdd.toDS())
        jsonDataDF.show(false)
        //+----+---------------+--------------------------------+-------+-------------+
        //|aaaa|deliverysystype|orderid                         |storeid|timestamp    |
        //+----+---------------+--------------------------------+-------+-------------+
        //|dms |dms            |B0001-N103-000-005882-RL3AI2RWCP|103    |1571587522000|
        //+----+---------------+--------------------------------+-------+-------------+
        val jsonstr =
        """{"data":{"deliverySysType":"dms","orderId":"B0001-N103-000-005882-RL3AI2RWCP","storeId":103,"timestamp":1571587522000},"accessKey":"f9d069861dfb1678","actionName":"candao.rider.getDeliveryInfo","sign":"fa0239c75e065cf43d0a4040665578ba" }"""
        val jsonStrDF = Seq(jsonstr).toDF("msg")
        //转换json数据列 action_name    actionName
        jsonStrDF.show(false)
        val structSeqSchme = StructType(Seq(StructField("data", jsonDataDF.schema, true),
          StructField("accessKey", StringType, true), //这里应该 accessKey
          StructField("actionName", StringType, true),
          StructField("columnNameOfCorruptRecord", StringType, true)
        ))
        //hive col name lower case, json data key capital and small letter,Take less than value
        val mapOption = Map("allowBackslashEscapingAnyCharacter" -> "true", "allowUnquotedControlChars" -> "true", "allowSingleQuotes" -> "true")
        //I'm not doing anything here, but I don't know how to set a value, right?
        val newDF = jsonStrDF.withColumn("data_col", from_json(col("msg"), structSeqSchme, mapOption))
        newDF.show(false)
        newDF.printSchema()
        newDF.select($"data_col.accessKey", $"data_col.actionName", $"data_col.data.*", $"data_col.columnNameOfCorruptRecord").show(false)
        //Lowercase columns do not fetch data. How do you make it ignore lowercase columns?  deliverysystype,storeid-> null
        //+----------------+----------------------------+----+---------------+-------+-------+-------------+-------------------------+
        //|accessKey       |actionName                  |aaaa|deliverysystype|orderid|storeid|timestamp    |columnNameOfCorruptRecord|
        //+----------------+----------------------------+----+---------------+-------+-------+-------------+-------------------------+
        //|f9d069861dfb1678|candao.rider.getDeliveryInfo|null|null           |null   |null   |1571587522000|null                     |
        //+----------------+----------------------------+----+---------------+-------+-------+-------------+-------------------------+
      }
      

      Attachments

        Activity

          People

            Unassigned Unassigned
            ruilaing ruiliang
            Votes:
            0 Vote for this issue
            Watchers:
            1 Start watching this issue

            Dates

              Created:
              Updated:
              Resolved:

              Time Tracking

                Estimated:
                Original Estimate - 12h
                12h
                Remaining:
                Remaining Estimate - 12h
                12h
                Logged:
                Time Spent - Not Specified
                Not Specified