Details
-
Bug
-
Status: Resolved
-
Major
-
Resolution: Not A Problem
-
2.0.0
-
None
-
None
Description
Example shell for repro:
scala> val df =spark.read.format("csv").option("header", "true").option("inferSchema", "true").load("/home/holden/Downloads/ex*.csv")
df: org.apache.spark.sql.DataFrame = [Date: string, Lifetime Total Likes: int ... 2125 more fields]scala> df.schema
res0: org.apache.spark.sql.types.StructType = StructType(StructField(Date,StringType,true), StructField(Lifetime Total Likes,IntegerType,true), StructField(Daily New Likes,IntegerType,true), StructField(Daily Unlikes,IntegerType,true), StructField(Daily Page Engaged Users,IntegerType,true), StructField(Weekly Page Engaged Users,IntegerType,true), StructField(28 Days Page Engaged Users,IntegerType,true), StructField(Daily Like Sources - On Your Page,IntegerType,true), StructField(Daily Total Reach,IntegerType,true), StructField(Weekly Total Reach,IntegerType,true), StructField(28 Days Total Reach,IntegerType,true), StructField(Daily Organic Reach,IntegerType,true), StructField(Weekly Organic Reach,IntegerType,true), StructField(28 Days Organic Reach,IntegerType,true), StructField(Daily T...scala> df.take(1)
[GIANT LIST OF COLUMNS]
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan$$anonfun$resolve$1$$anonfun$apply$5.apply(LogicalPlan.scala:134)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan$$anonfun$resolve$1$$anonfun$apply$5.apply(LogicalPlan.scala:134)
at scala.Option.getOrElse(Option.scala:121)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan$$anonfun$resolve$1.apply(LogicalPlan.scala:133)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan$$anonfun$resolve$1.apply(LogicalPlan.scala:129)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
at scala.collection.IterableLike$class.foreach(IterableLike.scala:72)
at org.apache.spark.sql.types.StructType.foreach(StructType.scala:95)
at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
at org.apache.spark.sql.types.StructType.map(StructType.scala:95)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolve(LogicalPlan.scala:129)
at org.apache.spark.sql.execution.datasources.FileSourceStrategy$.apply(FileSourceStrategy.scala:87)
at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$1.apply(QueryPlanner.scala:60)
at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$1.apply(QueryPlanner.scala:60)
at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:434)
at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
at org.apache.spark.sql.catalyst.planning.QueryPlanner.plan(QueryPlanner.scala:61)
at org.apache.spark.sql.execution.SparkPlanner.plan(SparkPlanner.scala:47)
at org.apache.spark.sql.execution.SparkPlanner$$anonfun$plan$1$$anonfun$apply$1.applyOrElse(SparkPlanner.scala:51)
at org.apache.spark.sql.execution.SparkPlanner$$anonfun$plan$1$$anonfun$apply$1.applyOrElse(SparkPlanner.scala:48)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformUp$1.apply(TreeNode.scala:301)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformUp$1.apply(TreeNode.scala:301)
at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:69)
at org.apache.spark.sql.catalyst.trees.TreeNode.transformUp(TreeNode.scala:300)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$4.apply(TreeNode.scala:298)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$4.apply(TreeNode.scala:298)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$5.apply(TreeNode.scala:321)
at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:179)
at org.apache.spark.sql.catalyst.trees.TreeNode.transformChildren(TreeNode.scala:319)
at org.apache.spark.sql.catalyst.trees.TreeNode.transformUp(TreeNode.scala:298)
at org.apache.spark.sql.execution.SparkPlanner$$anonfun$plan$1.apply(SparkPlanner.scala:48)
at org.apache.spark.sql.execution.SparkPlanner$$anonfun$plan$1.apply(SparkPlanner.scala:48)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
at org.apache.spark.sql.execution.QueryExecution.sparkPlan$lzycompute(QueryExecution.scala:78)
at org.apache.spark.sql.execution.QueryExecution.sparkPlan(QueryExecution.scala:76)
at org.apache.spark.sql.execution.QueryExecution.executedPlan$lzycompute(QueryExecution.scala:83)
at org.apache.spark.sql.execution.QueryExecution.executedPlan(QueryExecution.scala:83)
at org.apache.spark.sql.Dataset.withTypedCallback(Dataset.scala:2558)
at org.apache.spark.sql.Dataset.head(Dataset.scala:1924)
at org.apache.spark.sql.Dataset.take(Dataset.scala:2139)
... 48 elided
Interestingly enough attempting to access row by index also fails in column resolution phase or converting to an RDD also fails.
Loading without header on succeeds.
csv file for repro (on dropbox): https://www.dropbox.com/s/f8453txcej43mz4/example_facebook.csv?dl=0