Apache Hudi / HUDI-3065

Spark auto partition discovery does not work from 0.9.0



    Description

      With 0.8.0, if the partition path is of the format "/partitionKey=partitionValue", Spark's automatic partition discovery kicks in and the partition columns show up as explicit fields in the schema of the table read back through Hudi.

      But with 0.9.0, this no longer happens.
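
      For context, here is a minimal sketch (not part of the original repro) of what Spark auto partition discovery does, assuming a spark-shell session; the scratch path demoPath is an assumed, illustrative location. Writing a Hive-style "key=value" directory layout and reading the base path back through the plain parquet datasource surfaces the keys as columns:

      // illustration only; demoPath is an assumed scratch location
      val demoPath = "file:///tmp/partition_discovery_demo"
      Seq(("americas", "united_states", "san_francisco", 1L)).
        toDF("continent", "country", "city", "ts").
        write.partitionBy("continent", "country", "city").
        mode("overwrite").
        parquet(demoPath)
      // directories now look like .../continent=americas/country=united_states/city=san_francisco/
      spark.read.parquet(demoPath).printSchema
      // continent, country, city appear as discovered partition columns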

      // launch spark shell with 0.8.0 

      import org.apache.hudi.QuickstartUtils._
      import scala.collection.JavaConversions._
      import org.apache.spark.sql.SaveMode._
      import org.apache.hudi.DataSourceReadOptions._
      import org.apache.hudi.DataSourceWriteOptions._
      import org.apache.hudi.config.HoodieWriteConfig._
      val tableName = "hudi_trips_cow"
      val basePath = "file:///tmp/hudi_trips_cow"
      val dataGen = new DataGenerator
      
      val inserts = convertToStringList(dataGen.generateInserts(10))
      val df = spark.read.json(spark.sparkContext.parallelize(inserts, 2))
      val newDf = df.withColumn("partitionpath", regexp_replace($"partitionpath", "(.*)(\\/){1}(.*)(\\/){1}", "continent=$1$2country=$3$4city="))
      
      newDf.write.format("hudi").
      options(getQuickstartWriteConfigs).
      option(PRECOMBINE_FIELD_OPT_KEY, "ts").
      option(RECORDKEY_FIELD_OPT_KEY, "uuid").
      option(PARTITIONPATH_FIELD_OPT_KEY, "partitionpath").
      option(TABLE_NAME, tableName).
      mode(Overwrite).save(basePath)
      
      val tripsSnapshotDF = spark.
              read.
              format("hudi").
              load(basePath)
      
      tripsSnapshotDF.printSchema
      

      // output: note continent, country, city at the end of the schema.

      root
       |-- _hoodie_commit_time: string (nullable = true)
       |-- _hoodie_commit_seqno: string (nullable = true)
       |-- _hoodie_record_key: string (nullable = true)
       |-- _hoodie_partition_path: string (nullable = true)
       |-- _hoodie_file_name: string (nullable = true)
       |-- begin_lat: double (nullable = true)
       |-- begin_lon: double (nullable = true)
       |-- driver: string (nullable = true)
       |-- end_lat: double (nullable = true)
       |-- end_lon: double (nullable = true)
       |-- fare: double (nullable = true)
       |-- partitionpath: string (nullable = true)
       |-- rider: string (nullable = true)
       |-- ts: long (nullable = true)
       |-- uuid: string (nullable = true)
       |-- continent: string (nullable = true)
       |-- country: string (nullable = true)
       |-- city: string (nullable = true)
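
      Because continent, country and city are real columns here, they can be queried directly; a quick illustrative check (not in the original report, standard Spark API only):

      tripsSnapshotDF.select("continent", "country", "city").distinct().show(false)
      tripsSnapshotDF.filter($"country" === "united_states").count()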
      
       

       

      Let's run the same snippet with 0.9.0.

      import org.apache.hudi.QuickstartUtils._
      import scala.collection.JavaConversions._
      import org.apache.spark.sql.SaveMode._
      import org.apache.hudi.DataSourceReadOptions._
      import org.apache.hudi.DataSourceWriteOptions._
      import org.apache.hudi.config.HoodieWriteConfig._
      val tableName = "hudi_trips_cow"
      val basePath = "file:///tmp/hudi_trips_cow"
      val dataGen = new DataGenerator
      
      val inserts = convertToStringList(dataGen.generateInserts(10))
      val df = spark.read.json(spark.sparkContext.parallelize(inserts, 2))
      val newDf = df.withColumn("partitionpath", regexp_replace($"partitionpath", "(.*)(\\/){1}(.*)(\\/){1}", "continent=$1$2country=$3$4city="))
      
      newDf.write.format("hudi").
      options(getQuickstartWriteConfigs).
      option(PRECOMBINE_FIELD_OPT_KEY, "ts").
      option(RECORDKEY_FIELD_OPT_KEY, "uuid").
      option(PARTITIONPATH_FIELD_OPT_KEY, "partitionpath").
      option(TABLE_NAME, tableName).
      mode(Overwrite).save(basePath)
      
      val tripsSnapshotDF = spark.
              read.
              format("hudi").
              load(basePath)
      
      tripsSnapshotDF.printSchema
      
      

      // output: continent, country, city are missing.

      root
       |-- _hoodie_commit_time: string (nullable = true)
       |-- _hoodie_commit_seqno: string (nullable = true)
       |-- _hoodie_record_key: string (nullable = true)
       |-- _hoodie_partition_path: string (nullable = true)
       |-- _hoodie_file_name: string (nullable = true)
       |-- begin_lat: double (nullable = true)
       |-- begin_lon: double (nullable = true)
       |-- driver: string (nullable = true)
       |-- end_lat: double (nullable = true)
       |-- end_lon: double (nullable = true)
       |-- fare: double (nullable = true)
       |-- rider: string (nullable = true)
       |-- ts: long (nullable = true)
       |-- uuid: string (nullable = true)
       |-- partitionpath: string (nullable = true)
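
      A hedged workaround sketch (not from the report): until 0.9.0 surfaces the keys the way 0.8.0 did, they can be re-derived from the Hive-style _hoodie_partition_path column using standard Spark functions; withKeys is an illustrative name.

      import org.apache.spark.sql.functions.regexp_extract
      
      val withKeys = tripsSnapshotDF.
        withColumn("continent", regexp_extract($"_hoodie_partition_path", "continent=([^/]+)", 1)).
        withColumn("country", regexp_extract($"_hoodie_partition_path", "country=([^/]+)", 1)).
        withColumn("city", regexp_extract($"_hoodie_partition_path", "city=([^/]+)", 1))
      withKeys.printSchema
      // continent, country, city are back as explicit string columns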
      
       

      Ref issue: https://github.com/apache/hudi/issues/3984


People

    Assignee: Yann Byron (biyan900116@gmail.com)
    Reporter: sivabalan narayanan (shivnarayan)
    Forward Xu, Shiyan Xu
    Votes: 0
    Watchers: 3
