SPARK-17384

SQL - Running query with outer join from 1.6 fails


Details

    • Type: Bug
    • Status: Closed
    • Priority: Major
    • Resolution: Duplicate
    • Affects Version/s: 2.0.0
    • Fix Version/s: None
    • Component/s: SQL
    • Labels: None

    Description

      I have some complex SQL queries (10-table joins) that use outer joins and work fine in Spark 1.6.2 but fail under Spark 2.0. I was able to reproduce the problem with a simple test case.

      Here's the test case, which fails in Spark 2.0 (it runs fine in Spark 1.6.2):

      // paste into spark-shell; a standalone app would also need "import spark.implicits._" for toDF
      case class C1(f1: String, f2: String, f3: String, f4: String)
      case class C2(g1: String, g2: String, g3: String, g4: String)
      case class C3(h1: String, h2: String, h3: String, h4: String)
      
      val sqlContext = spark.sqlContext 
      
      val c1 = sc.parallelize(Seq(
        C1("h1", "c1a1", "c1b1", "c1c1"),
        C1("h2", "c1a2", "c1b2", "c1c2"),
        C1(null, "c1a3", "c1b3", "c1c3")
        )).toDF
      c1.createOrReplaceTempView("c1")
      
      val c2 = sc.parallelize(Seq(
        C2("h1", "c2a1", "c2b1", "c2c1"),
        C2("h2", "c2a2", "c2b2", "c2c2"),
        C2(null, "c2a3", "c2b3", "c2c3"),
        C2(null, "c2a4", "c2b4", "c2c4"),
        C2("h333", "c2a333", "c2b333", "c2c333")
        )).toDF
      c2.createOrReplaceTempView("c2")
      
      val c3 = sc.parallelize(Seq(
        C3("h1", "c3a1", "c3b1", "c3c1"),
        C3("h2", "c3a2", "c3b2", "c3c2"),
        C3(null, "c3a3", "c3b3", "c3c3")
        )).toDF
      c3.createOrReplaceTempView("c3")
      
      // doesn't work in Spark 2.0, works in Spark 1.6
      val bad_df = sqlContext.sql("""
        select * 
        from c1, c3
        left outer join c2 on (c1.f1 = c2.g1)
        where c1.f1 = c3.h1
      """).show()
      
      // works in both
      val works_df = sqlContext.sql("""
        select * 
        from c1
        left outer join c2 on (c1.f1 = c2.g1), 
        c3
        where c1.f1 = c3.h1
      """).show()
      

      Here's the output after running bad_df in Spark 2.0:

      scala> val bad_df = sqlContext.sql("""
           |   select *
           |   from c1, c3
           |   left outer join c2 on (c1.f1 = c2.g1)
           |   where c1.f1 = c3.h1
           | """).show()
      org.apache.spark.sql.AnalysisException: cannot resolve '`c1.f1`' given input columns: [h3, g3, h4, g2, g4, h2, h1, g1]; line 4 pos 25
        at org.apache.spark.sql.catalyst.analysis.package$AnalysisErrorAt.failAnalysis(package.scala:42)
        at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1$$anonfun$apply$2.applyOrElse(CheckAnalysis.scala:77)
        at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1$$anonfun$apply$2.applyOrElse(CheckAnalysis.scala:74)
        at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformUp$1.apply(TreeNode.scala:301)
        at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformUp$1.apply(TreeNode.scala:301)
        at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:69)
        at org.apache.spark.sql.catalyst.trees.TreeNode.transformUp(TreeNode.scala:300)
        at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$4.apply(TreeNode.scala:298)
        at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$4.apply(TreeNode.scala:298)
        at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$5.apply(TreeNode.scala:321)
        at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:179)
        at org.apache.spark.sql.catalyst.trees.TreeNode.transformChildren(TreeNode.scala:319)
        at org.apache.spark.sql.catalyst.trees.TreeNode.transformUp(TreeNode.scala:298)
        at org.apache.spark.sql.catalyst.plans.QueryPlan.transformExpressionUp$1(QueryPlan.scala:190)
        at org.apache.spark.sql.catalyst.plans.QueryPlan.org$apache$spark$sql$catalyst$plans$QueryPlan$$recursiveTransform$2(QueryPlan.scala:201)
        at org.apache.spark.sql.catalyst.plans.QueryPlan$$anonfun$5.apply(QueryPlan.scala:209)
        at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:179)
        at org.apache.spark.sql.catalyst.plans.QueryPlan.transformExpressionsUp(QueryPlan.scala:209)
        at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1.apply(CheckAnalysis.scala:74)
        at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1.apply(CheckAnalysis.scala:67)
        at org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:126)
        at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$foreachUp$1.apply(TreeNode.scala:125)
        at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$foreachUp$1.apply(TreeNode.scala:125)
        at scala.collection.immutable.List.foreach(List.scala:381)
        at org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:125)
        at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$foreachUp$1.apply(TreeNode.scala:125)
        at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$foreachUp$1.apply(TreeNode.scala:125)
        at scala.collection.immutable.List.foreach(List.scala:381)
        at org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:125)
        at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$foreachUp$1.apply(TreeNode.scala:125)
        at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$foreachUp$1.apply(TreeNode.scala:125)
        at scala.collection.immutable.List.foreach(List.scala:381)
        at org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:125)
        at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$class.checkAnalysis(CheckAnalysis.scala:67)
        at org.apache.spark.sql.catalyst.analysis.Analyzer.checkAnalysis(Analyzer.scala:58)
        at org.apache.spark.sql.execution.QueryExecution.assertAnalyzed(QueryExecution.scala:49)
        at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:64)
        at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:582)
        at org.apache.spark.sql.SQLContext.sql(SQLContext.scala:682)
        ... 53 elided
      
      scala>
      

      I confirmed this also fails on the Spark 2.0 nightly build; the same code runs fine in Spark 1.6.2.
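
      As a possible workaround until this is resolved (a sketch using the c1/c2/c3 DataFrames defined above; it sidesteps the SQL parser rather than fixing it), the same joins can be written with the DataFrame API, where the join order is explicit and each condition only references columns already in scope:

      // the inner join on f1 = h1 replaces the WHERE filter; the outer join
      // condition now only uses columns that are in scope at that point
      val joined = c1
        .join(c3, c1("f1") === c3("h1"))
        .join(c2, c1("f1") === c2("g1"), "left_outer")
      joined.show()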

    People

      Assignee: hvanhovell (Herman van Hövell)
      Reporter: dondrake (Don Drake)
      Shepherd: Michael Armbrust
      Votes: 0
      Watchers: 2
