Uploaded image for project: 'Spark'
  1. Spark
  2. SPARK-21281

cannot create empty typed array column

    Details

    • Type: Bug
    • Status: Resolved
    • Priority: Minor
    • Resolution: Fixed
    • Affects Version/s: 2.1.1
    • Fix Version/s: 2.3.0
    • Component/s: SQL
    • Labels:
      None

      Description

      Hi all

      I am running this piece of code

      val data = spark.read.parquet("somedata.parquet")
      data.withColumn("my_new_column", array().cast("array<string>")).show
      

      and it works fine

      +------+---------+--------------------+---+
      |itemid|sentiment|                text|my_new_column|
      +------+---------+--------------------+---+
      |     1|        0|                 ...| []|
      |     2|        0|                 ...| []|
      |     3|        1|              omg...| []|
      |     4|        0|          .. Omga...| []|
      

      but when I do

      val data = spark.read.parquet("somedata.parquet")
      import org.apache.spark.sql.types._
      data.withColumn("my_new_column", array().cast("array<int>").show
      

      I get:

      scala.MatchError: NullType (of class org.apache.spark.sql.types.NullType$)
        at org.apache.spark.sql.catalyst.expressions.Cast.castToInt(Cast.scala:264)
        at org.apache.spark.sql.catalyst.expressions.Cast.org$apache$spark$sql$catalyst$expressions$Cast$$cast(Cast.scala:433)
        at org.apache.spark.sql.catalyst.expressions.Cast.castArray(Cast.scala:380)
        at org.apache.spark.sql.catalyst.expressions.Cast.org$apache$spark$sql$catalyst$expressions$Cast$$cast(Cast.scala:437)
        at org.apache.spark.sql.catalyst.expressions.Cast.cast$lzycompute(Cast.scala:447)
        at org.apache.spark.sql.catalyst.expressions.Cast.cast(Cast.scala:447)
        at org.apache.spark.sql.catalyst.expressions.Cast.nullSafeEval(Cast.scala:449)
        at org.apache.spark.sql.catalyst.expressions.UnaryExpression.eval(Expression.scala:325)
        at org.apache.spark.sql.catalyst.optimizer.ConstantFolding$$anonfun$apply$1$$anonfun$applyOrElse$1.applyOrElse(expressions.scala:50)
        at org.apache.spark.sql.catalyst.optimizer.ConstantFolding$$anonfun$apply$1$$anonfun$applyOrElse$1.applyOrElse(expressions.scala:43)
        at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:288)
        at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:288)
        at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:70)
        at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:287)
        at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformDown$1.apply(TreeNode.scala:293)
        at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformDown$1.apply(TreeNode.scala:293)
        at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$5.apply(TreeNode.scala:331)
        at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:188)
        at org.apache.spark.sql.catalyst.trees.TreeNode.transformChildren(TreeNode.scala:329)
        at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:293)
        at org.apache.spark.sql.catalyst.plans.QueryPlan.transformExpressionDown$1(QueryPlan.scala:248)
        at org.apache.spark.sql.catalyst.plans.QueryPlan.org$apache$spark$sql$catalyst$plans$QueryPlan$$recursiveTransform$1(QueryPlan.scala:258)
        at org.apache.spark.sql.catalyst.plans.QueryPlan$$anonfun$org$apache$spark$sql$catalyst$plans$QueryPlan$$recursiveTransform$1$1.apply(QueryPlan.scala:262)
        at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
        at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
        at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
        at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
        at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
        at scala.collection.AbstractTraversable.map(Traversable.scala:104)
        at org.apache.spark.sql.catalyst.plans.QueryPlan.org$apache$spark$sql$catalyst$plans$QueryPlan$$recursiveTransform$1(QueryPlan.scala:262)
        at org.apache.spark.sql.catalyst.plans.QueryPlan$$anonfun$6.apply(QueryPlan.scala:267)
        at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:188)
        at org.apache.spark.sql.catalyst.plans.QueryPlan.transformExpressionsDown(QueryPlan.scala:267)
        at org.apache.spark.sql.catalyst.optimizer.ConstantFolding$$anonfun$apply$1.applyOrElse(expressions.scala:43)
        at org.apache.spark.sql.catalyst.optimizer.ConstantFolding$$anonfun$apply$1.applyOrElse(expressions.scala:42)
        at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:288)
        at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:288)
        at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:70)
        at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:287)
        at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformDown$1.apply(TreeNode.scala:293)
        at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformDown$1.apply(TreeNode.scala:293)
        at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$5.apply(TreeNode.scala:331)
        at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:188)
        at org.apache.spark.sql.catalyst.trees.TreeNode.transformChildren(TreeNode.scala:329)
        at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:293)
        at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformDown$1.apply(TreeNode.scala:293)
        at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformDown$1.apply(TreeNode.scala:293)
        at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$5.apply(TreeNode.scala:331)
        at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:188)
        at org.apache.spark.sql.catalyst.trees.TreeNode.transformChildren(TreeNode.scala:329)
        at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:293)
        at org.apache.spark.sql.catalyst.trees.TreeNode.transform(TreeNode.scala:277)
        at org.apache.spark.sql.catalyst.optimizer.ConstantFolding$.apply(expressions.scala:42)
        at org.apache.spark.sql.catalyst.optimizer.ConstantFolding$.apply(expressions.scala:41)
        at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1$$anonfun$apply$1.apply(RuleExecutor.scala:85)
        at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1$$anonfun$apply$1.apply(RuleExecutor.scala:82)
        at scala.collection.IndexedSeqOptimized$class.foldl(IndexedSeqOptimized.scala:57)
        at scala.collection.IndexedSeqOptimized$class.foldLeft(IndexedSeqOptimized.scala:66)
        at scala.collection.mutable.WrappedArray.foldLeft(WrappedArray.scala:35)
        at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1.apply(RuleExecutor.scala:82)
        at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1.apply(RuleExecutor.scala:74)
        at scala.collection.immutable.List.foreach(List.scala:381)
        at org.apache.spark.sql.catalyst.rules.RuleExecutor.execute(RuleExecutor.scala:74)
        at org.apache.spark.sql.execution.QueryExecution.optimizedPlan$lzycompute(QueryExecution.scala:73)
        at org.apache.spark.sql.execution.QueryExecution.optimizedPlan(QueryExecution.scala:73)
        at org.apache.spark.sql.execution.QueryExecution.sparkPlan$lzycompute(QueryExecution.scala:79)
        at org.apache.spark.sql.execution.QueryExecution.sparkPlan(QueryExecution.scala:75)
        at org.apache.spark.sql.execution.QueryExecution.executedPlan$lzycompute(QueryExecution.scala:84)
        at org.apache.spark.sql.execution.QueryExecution.executedPlan(QueryExecution.scala:84)
        at org.apache.spark.sql.Dataset.withTypedCallback(Dataset.scala:2791)
        at org.apache.spark.sql.Dataset.head(Dataset.scala:2112)
        at org.apache.spark.sql.Dataset.take(Dataset.scala:2327)
        at org.apache.spark.sql.Dataset.showString(Dataset.scala:248)
        at org.apache.spark.sql.Dataset.show(Dataset.scala:636)
        at org.apache.spark.sql.Dataset.show(Dataset.scala:595)
        at org.apache.spark.sql.Dataset.show(Dataset.scala:604)
        ... 50 elided
      

        Attachments

          Activity

            People

            • Assignee:
              maropu Takeshi Yamamuro
              Reporter:
              revolucion09 Saif Addin
            • Votes:
              0 Vote for this issue
              Watchers:
              4 Start watching this issue

              Dates

              • Created:
                Updated:
                Resolved: