Spark / SPARK-20093

Exception when joining a dataframe with another dataframe generated by applying a groupBy transformation to the original one


Details

    • Type: Bug
    • Status: Resolved
    • Priority: Major
    • Resolution: Duplicate
    • Affects Version/s: 2.0.0, 2.0.1, 2.0.2, 2.1.0, 2.2.0
    • Fix Version/s: None
    • Component/s: SQL
    • Labels: None

    Description

      When we generate a dataframe by grouping and then join the original dataframe with it on the aggregated column, we get an AnalysisException. Below is a piece of code and the resulting exception to reproduce the issue.

      Code:

      import org.apache.spark.sql.SparkSession

      object App {

        lazy val spark = SparkSession.builder.appName("Test").master("local").getOrCreate

        def main(args: Array[String]): Unit = {
          test1
        }

        private def test1 {
          import org.apache.spark.sql.functions._

          val df = spark.createDataFrame(Seq(
            ("M", 172, 60), ("M", 170, 60), ("F", 155, 56), ("M", 160, 55), ("F", 150, 53)
          )).toDF("gender", "height", "weight")

          val groupDF = df.groupBy("gender").agg(min("height").as("height"))
          groupDF.show()

          val out = groupDF.join(df, groupDF("height") <=> df("height"))
            .select(df("gender"), df("height"), df("weight"))
          out.show
        }
      }

      When I ran the above code, I got the following exception:

      Exception in thread "main" org.apache.spark.sql.AnalysisException: resolved attribute(s) height#8 missing from height#19,height#30,gender#29,weight#31,gender#7 in operator !Join Inner, (height#19 <=> height#8);;
      !Join Inner, (height#19 <=> height#8)
      :- Aggregate gender#7, gender#7, min(height#8) AS height#19
      : +- Project _1#0 AS gender#7, _2#1 AS height#8, _3#2 AS weight#9
      : +- LocalRelation _1#0, _2#1, _3#2
      +- Project _1#0 AS gender#29, _2#1 AS height#30, _3#2 AS weight#31
      +- LocalRelation _1#0, _2#1, _3#2

      at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$class.failAnalysis(CheckAnalysis.scala:39)
      at org.apache.spark.sql.catalyst.analysis.Analyzer.failAnalysis(Analyzer.scala:90)
      at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1.apply(CheckAnalysis.scala:342)
      at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1.apply(CheckAnalysis.scala:78)
      at org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:127)
      at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$class.checkAnalysis(CheckAnalysis.scala:78)
      at org.apache.spark.sql.catalyst.analysis.Analyzer.checkAnalysis(Analyzer.scala:90)
      at org.apache.spark.sql.execution.QueryExecution.assertAnalyzed(QueryExecution.scala:53)
      at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:67)
      at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$withPlan(Dataset.scala:2831)
      at org.apache.spark.sql.Dataset.join(Dataset.scala:843)
      at org.apache.spark.sql.Dataset.join(Dataset.scala:807)
      at App$.test1(App.scala:17)
      at App$.main(App.scala:9)
      at App.main(App.scala)

      Could someone please look into this?
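
      For reference, one possible workaround (untested against the affected versions, and not part of the original report) is to alias both sides of the join and refer to the columns through those aliases, so the analyzer can resolve each "height" unambiguously instead of going through df("height"), whose underlying attribute gets fresh expression IDs when df appears on both sides of the plan. A minimal sketch, reusing df from the code above:

      import org.apache.spark.sql.functions.{col, min}

      // Same aggregation as in the reproduction.
      val groupDF = df.groupBy("gender").agg(min("height").as("height"))

      // Alias both sides so "g.height" and "d.height" resolve unambiguously.
      val out = groupDF.as("g")
        .join(df.as("d"), col("g.height") <=> col("d.height"))
        .select(col("d.gender"), col("d.height"), col("d.weight"))
      out.show()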


    People

      Assignee: Unassigned
      Reporter: Hosur Narahari (narahari92)
      Votes: 0
      Watchers: 3
