Uploaded image for project: 'Spark'
  1. Spark
  2. SPARK-19477

[SQL] Datasets created from a Dataframe with extra columns retain the extra columns



    • Bug
    • Status: Resolved
    • Major
    • Resolution: Incomplete
    • 2.1.0
    • None
    • SQL


      In Spark 1.6, when you created a Dataset from a DataFrame that had extra columns, the columns not present in the case class were dropped from the Dataset.

      For example, in 1.6 the column c4 is dropped:

      scala> case class F(f1: String, f2: String, f3:String)
      defined class F
      scala> import sqlContext.implicits._
      import sqlContext.implicits._
      scala> val df = Seq(("a","b","c","x"), ("d", "e", "f","y"), ("h", "i", "j","z")).toDF("f1", "f2", "f3", "c4")
      df: org.apache.spark.sql.DataFrame = [f1: string, f2: string, f3: string, c4: string]
      scala> val ds = df.as[F]
      ds: org.apache.spark.sql.Dataset[F] = [f1: string, f2: string, f3: string]
      scala> ds.show
      +---+---+---+
      | f1| f2| f3|
      +---+---+---+
      |  a|  b|  c|
      |  d|  e|  f|
      |  h|  i|  j|
      +---+---+---+

      This seems to have changed in Spark 2.0 and 2.1, where the extra column c4 is retained and the Dataset's schema no longer matches the encoder's schema:

      Spark 2.1.0:

      scala> case class F(f1: String, f2: String, f3:String)
      defined class F
      scala> import spark.implicits._
      import spark.implicits._
      scala> val df = Seq(("a","b","c","x"), ("d", "e", "f","y"), ("h", "i", "j","z")).toDF("f1", "f2", "f3", "c4")
      df: org.apache.spark.sql.DataFrame = [f1: string, f2: string ... 2 more fields]
      scala> val ds = df.as[F]
      ds: org.apache.spark.sql.Dataset[F] = [f1: string, f2: string ... 2 more fields]
      scala> ds.show
      +---+---+---+---+
      | f1| f2| f3| c4|
      +---+---+---+---+
      |  a|  b|  c|  x|
      |  d|  e|  f|  y|
      |  h|  i|  j|  z|
      +---+---+---+---+
      scala> import org.apache.spark.sql.Encoders
      import org.apache.spark.sql.Encoders
      scala> val fEncoder = Encoders.product[F]
      fEncoder: org.apache.spark.sql.Encoder[F] = class[f1[0]: string, f2[0]: string, f3[0]: string]
      scala> fEncoder.schema == ds.schema
      res2: Boolean = false
      scala> ds.schema
      res3: org.apache.spark.sql.types.StructType = StructType(StructField(f1,StringType,true), StructField(f2,StringType,true), StructField(f3,StringType,true), StructField(c4,StringType,true))
      scala> fEncoder.schema
      res4: org.apache.spark.sql.types.StructType = StructType(StructField(f1,StringType,true), StructField(f2,StringType,true), StructField(f3,StringType,true))




            Unassigned Unassigned
            dondrake Don Drake
            2 Vote for this issue
            8 Start watching this issue

