Uploaded image for project: 'Spark'
  1. Spark
  2. SPARK-4850

"GROUP BY" can't work if the schema of SchemaRDD contains struct or array type

    XMLWordPrintableJSON

Details

    • Bug
    • Status: Closed
    • Major
    • Resolution: Invalid
    • 1.1.0, 1.1.1, 1.1.2, 1.2.0
    • None
    • SQL

    Description

      Run the following code in the Spark shell:

      val sqlContext = new org.apache.spark.sql.SQLContext(sc)
      val path = "path/to/json"
      sqlContext.jsonFile(path).registerTempTable("Table")
      val t = sqlContext.sql("select * from Table group by a")
      t.collect
      

      Let's look into the schema of `Table`

      root
       |-- a: integer (nullable = true)
       |-- arr: array (nullable = true)
       |    |-- element: integer (containsNull = false)
       |-- createdAt: string (nullable = true)
       |-- f: struct (nullable = true)
       |    |-- __type: string (nullable = true)
       |    |-- className: string (nullable = true)
       |    |-- objectId: string (nullable = true)
       |-- objectId: string (nullable = true)
       |-- s: string (nullable = true)
       |-- updatedAt: string (nullable = true)
      

      The following exception will be thrown:

      org.apache.spark.sql.catalyst.errors.package$TreeNodeException: Expression not in GROUP BY: arr#9, tree:
      Aggregate [a#8], [a#8,arr#9,createdAt#10,f#11,objectId#12,s#13,updatedAt#14]
       Subquery TestImport
        LogicalRDD [a#8,arr#9,createdAt#10,f#11,objectId#12,s#13,updatedAt#14], MappedRDD[18] at map at JsonRDD.scala:47
      
      	at org.apache.spark.sql.catalyst.analysis.Analyzer$CheckAggregation$$anonfun$apply$3$$anonfun$applyOrElse$7.apply(Analyzer.scala:126)
      	at org.apache.spark.sql.catalyst.analysis.Analyzer$CheckAggregation$$anonfun$apply$3$$anonfun$applyOrElse$7.apply(Analyzer.scala:125)
      	at scala.Option.foreach(Option.scala:236)
      	at org.apache.spark.sql.catalyst.analysis.Analyzer$CheckAggregation$$anonfun$apply$3.applyOrElse(Analyzer.scala:125)
      	at org.apache.spark.sql.catalyst.analysis.Analyzer$CheckAggregation$$anonfun$apply$3.applyOrElse(Analyzer.scala:108)
      	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:144)
      	at org.apache.spark.sql.catalyst.trees.TreeNode.transform(TreeNode.scala:135)
      	at org.apache.spark.sql.catalyst.analysis.Analyzer$CheckAggregation$.apply(Analyzer.scala:108)
      	at org.apache.spark.sql.catalyst.analysis.Analyzer$CheckAggregation$.apply(Analyzer.scala:106)
      	at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$apply$1$$anonfun$apply$2.apply(RuleExecutor.scala:61)
      	at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$apply$1$$anonfun$apply$2.apply(RuleExecutor.scala:59)
      	at scala.collection.IndexedSeqOptimized$class.foldl(IndexedSeqOptimized.scala:51)
      	at scala.collection.IndexedSeqOptimized$class.foldLeft(IndexedSeqOptimized.scala:60)
      	at scala.collection.mutable.WrappedArray.foldLeft(WrappedArray.scala:34)
      	at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$apply$1.apply(RuleExecutor.scala:59)
      	at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$apply$1.apply(RuleExecutor.scala:51)
      	at scala.collection.immutable.List.foreach(List.scala:318)
      	at org.apache.spark.sql.catalyst.rules.RuleExecutor.apply(RuleExecutor.scala:51)
      	at org.apache.spark.sql.SQLContext$QueryExecution.analyzed$lzycompute(SQLContext.scala:411)
      	at org.apache.spark.sql.SQLContext$QueryExecution.analyzed(SQLContext.scala:411)
      	at org.apache.spark.sql.SQLContext$QueryExecution.withCachedData$lzycompute(SQLContext.scala:412)
      	at org.apache.spark.sql.SQLContext$QueryExecution.withCachedData(SQLContext.scala:412)
      	at org.apache.spark.sql.SQLContext$QueryExecution.optimizedPlan$lzycompute(SQLContext.scala:413)
      	at org.apache.spark.sql.SQLContext$QueryExecution.optimizedPlan(SQLContext.scala:413)
      	at org.apache.spark.sql.SQLContext$QueryExecution.sparkPlan$lzycompute(SQLContext.scala:418)
      	at org.apache.spark.sql.SQLContext$QueryExecution.sparkPlan(SQLContext.scala:416)
      	at org.apache.spark.sql.SQLContext$QueryExecution.executedPlan$lzycompute(SQLContext.scala:422)
      	at org.apache.spark.sql.SQLContext$QueryExecution.executedPlan(SQLContext.scala:422)
      	at org.apache.spark.sql.SchemaRDD.collect(SchemaRDD.scala:444)
      	at $iwC$$iwC$$iwC$$iwC.<init>(<console>:17)
      	at $iwC$$iwC$$iwC.<init>(<console>:22)
      	at $iwC$$iwC.<init>(<console>:24)
      	at $iwC.<init>(<console>:26)
      	at <init>(<console>:28)
      	at .<init>(<console>:32)
      	at .<clinit>(<console>)
      	at .<init>(<console>:7)
      	at .<clinit>(<console>)
      	at $print(<console>)
      	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
      	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:39)
      	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:25)
      	at java.lang.reflect.Method.invoke(Method.java:597)
      	at org.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)
      	at org.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)
      	at org.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)
      	at org.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)
      	at org.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)
      	at org.apache.spark.repl.SparkILoop.reallyInterpret$1(SparkILoop.scala:828)
      	at org.apache.spark.repl.SparkILoop.interpretStartingWith(SparkILoop.scala:873)
      	at org.apache.spark.repl.SparkILoop.command(SparkILoop.scala:785)
      	at org.apache.spark.repl.SparkILoop.processLine$1(SparkILoop.scala:628)
      	at org.apache.spark.repl.SparkILoop.innerLoop$1(SparkILoop.scala:636)
      	at org.apache.spark.repl.SparkILoop.loop(SparkILoop.scala:641)
      	at org.apache.spark.repl.SparkILoop$$anonfun$process$1.apply$mcZ$sp(SparkILoop.scala:968)
      	at org.apache.spark.repl.SparkILoop$$anonfun$process$1.apply(SparkILoop.scala:916)
      	at org.apache.spark.repl.SparkILoop$$anonfun$process$1.apply(SparkILoop.scala:916)
      	at scala.tools.nsc.util.ScalaClassLoader$.savingContextLoader(ScalaClassLoader.scala:135)
      	at org.apache.spark.repl.SparkILoop.process(SparkILoop.scala:916)
      	at org.apache.spark.repl.SparkILoop.process(SparkILoop.scala:1011)
      	at org.apache.spark.repl.Main$.main(Main.scala:31)
      	at org.apache.spark.repl.Main.main(Main.scala)
      	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
      	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:39)
      	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:25)
      	at java.lang.reflect.Method.invoke(Method.java:597)
      	at org.apache.spark.deploy.SparkSubmit$.launch(SparkSubmit.scala:358)
      	at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:75)
      	at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
      
      

      Attachments

        Activity

          People

            lian cheng Cheng Lian
            debugger87 Chaozhong Yang
            Cheng Lian Cheng Lian
            Votes:
            0 Vote for this issue
            Watchers:
            3 Start watching this issue

            Dates

              Created:
              Updated:
              Resolved:

              Time Tracking

                Estimated:
                Original Estimate - 96h
                96h
                Remaining:
                Remaining Estimate - 96h
                96h
                Logged:
                Time Spent - Not Specified
                Not Specified