SPARK-20804: Join with null safe equality fails with AnalysisException


Details

    • Type: Bug
    • Status: Closed
    • Priority: Minor
    • Resolution: Duplicate
    • Affects Version/s: 2.2.0
    • Fix Version/s: None
    • Component/s: SQL
    • Labels: None
    • Environment: org.apache.spark#spark-sql_2.11;2.3.0-SNAPSHOT from ASF snapshots, Mon May 15 08:09:18 EDT 2017

    Description

      Joining a DataFrame with an aggregation of itself, using a null safe equality (<=>) condition on the shared column, fails:

      // both imports are automatic in spark-shell; shown for completeness
      import org.apache.spark.sql.functions.sum
      import spark.implicits._

      val x = Seq(("a", 1), ("a", 2), (null, 1)).toDF("k", "v")
      val sums = x.groupBy($"k").agg(sum($"v") as "sum")
      x
        .join(sums, x("k") <=> sums("k"))
        .drop(sums("k"))
        .show

      gives:

        org.apache.spark.sql.AnalysisException: Detected cartesian product for INNER join between logical plans
      Project [_2#54 AS v#57]
      +- LocalRelation [_1#53, _2#54]
      and
      Aggregate [k#69], [k#69, sum(cast(v#70 as bigint)) AS sum#65L]
      +- Project [_1#53 AS k#69, _2#54 AS v#70]
         +- LocalRelation [_1#53, _2#54]
      Join condition is missing or trivial.
      Use the CROSS JOIN syntax to allow cartesian products between these relations.;
        at org.apache.spark.sql.catalyst.optimizer.CheckCartesianProducts$$anonfun$apply$20.applyOrElse(Optimizer.scala:1081)
        at org.apache.spark.sql.catalyst.optimizer.CheckCartesianProducts$$anonfun$apply$20.applyOrElse(Optimizer.scala:1078)
        at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$2.apply(TreeNode.scala:267)
        at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$2.apply(TreeNode.scala:267)
        at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:70)
        at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:266)
        at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformDown$1.apply(TreeNode.scala:272)
        at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformDown$1.apply(TreeNode.scala:272)
        at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$4.apply(TreeNode.scala:306)
        at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:187)
        at org.apache.spark.sql.catalyst.trees.TreeNode.mapChildren(TreeNode.scala:304)
        at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:272)
        at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformDown$1.apply(TreeNode.scala:272)
        at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformDown$1.apply(TreeNode.scala:272)
        at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$4.apply(TreeNode.scala:306)
        at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:187)
        at org.apache.spark.sql.catalyst.trees.TreeNode.mapChildren(TreeNode.scala:304)
        at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:272)
        at org.apache.spark.sql.catalyst.trees.TreeNode.transform(TreeNode.scala:256)
        at org.apache.spark.sql.catalyst.optimizer.CheckCartesianProducts.apply(Optimizer.scala:1078)
        at org.apache.spark.sql.catalyst.optimizer.CheckCartesianProducts.apply(Optimizer.scala:1063)
        at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1$$anonfun$apply$1.apply(RuleExecutor.scala:85)
        at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1$$anonfun$apply$1.apply(RuleExecutor.scala:82)
        at scala.collection.IndexedSeqOptimized$class.foldl(IndexedSeqOptimized.scala:57)
        at scala.collection.IndexedSeqOptimized$class.foldLeft(IndexedSeqOptimized.scala:66)
        at scala.collection.mutable.WrappedArray.foldLeft(WrappedArray.scala:35)
        at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1.apply(RuleExecutor.scala:82)
        at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1.apply(RuleExecutor.scala:74)
        at scala.collection.immutable.List.foreach(List.scala:381)
        at org.apache.spark.sql.catalyst.rules.RuleExecutor.execute(RuleExecutor.scala:74)
        at org.apache.spark.sql.execution.QueryExecution.optimizedPlan$lzycompute(QueryExecution.scala:79)
        at org.apache.spark.sql.execution.QueryExecution.optimizedPlan(QueryExecution.scala:79)
        at org.apache.spark.sql.execution.QueryExecution.sparkPlan$lzycompute(QueryExecution.scala:85)
        at org.apache.spark.sql.execution.QueryExecution.sparkPlan(QueryExecution.scala:81)
        at org.apache.spark.sql.execution.QueryExecution.executedPlan$lzycompute(QueryExecution.scala:90)
        at org.apache.spark.sql.execution.QueryExecution.executedPlan(QueryExecution.scala:90)
        at org.apache.spark.sql.Dataset.withAction(Dataset.scala:2901)
        at org.apache.spark.sql.Dataset.head(Dataset.scala:2238)
        at org.apache.spark.sql.Dataset.take(Dataset.scala:2451)
        at org.apache.spark.sql.Dataset.showString(Dataset.scala:248)
        at org.apache.spark.sql.Dataset.show(Dataset.scala:680)
        at org.apache.spark.sql.Dataset.show(Dataset.scala:639)
        at org.apache.spark.sql.Dataset.show(Dataset.scala:648)
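
      The exception is a symptom of self-join column ambiguity: because sums is derived from x, sums("k") carries the same attribute ID as x("k"). After the analyzer de-duplicates the right side of the self join, both references in the condition point at the left plan, so the condition is trivially true (null <=> null is true), the optimizer pushes it out of the join, and CheckCartesianProducts rejects the now unconditioned inner join. The plans in the error show it: the left side has lost k entirely (drop(sums("k")) removed x's own column) while the aggregate side got fresh IDs (k#69). A minimal sketch to observe this in the same session (the printed expression IDs are illustrative and vary per run):

      // Both Column objects wrap the same underlying attribute, which is why
      // the null safe comparison below is a self-comparison that folds away.
      println(x("k").expr)            // e.g. k#55
      println(sums("k").expr)         // same attribute, same expression ID
      println(x("k") <=> sums("k"))   // the degenerate condition, (k <=> k)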
      

      but renaming the key column before aggregating works fine:

      val x = Seq(("a", 1), ("a", 2), (null, 1)).toDF("k", "v")
      val sums = x.select($"k" as "k1", $"v").groupBy($"k1").agg(sum($"v") as "sum")
      x
        .join(sums, x("k") <=> sums("k1"))
        .drop(sums("k1"))
        .show
      +----+---+---+                                                                  
      |   k|  v|sum|
      +----+---+---+
      |   a|  1|  3|
      |   a|  2|  3|
      |null|  1|  1|
      +----+---+---+
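
      Any transformation that gives the aggregated side fresh attribute IDs for the key works the same way. An untested sketch of an equivalent workaround, renaming after the aggregation instead of before (sums2 and k1 are illustrative names, not from the report):

      // withColumnRenamed adds a projection with a new expression ID, so
      // sums2("k1") no longer aliases x("k") and the condition survives.
      val sums2 = x.groupBy($"k").agg(sum($"v") as "sum").withColumnRenamed("k", "k1")
      x
        .join(sums2, x("k") <=> sums2("k1"))
        .drop(sums2("k1"))
        .show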
      

            People

              Assignee: Unassigned
              Reporter: koert kuipers
              Votes: 0
              Watchers: 3
