Uploaded image for project: 'Spark'
  1. Spark
  2. SPARK-24033

LAG Window function broken in Spark 2.3

    XMLWordPrintableJSON

    Details

    • Type: Bug
    • Status: Resolved
    • Priority: Major
    • Resolution: Fixed
    • Affects Version/s: 2.3.0
    • Fix Version/s: 2.3.1, 2.4.0
    • Component/s: SQL
    • Labels:
      None

      Description

      The LAG window function appears to be broken in Spark 2.3.0, always failing with an AnalysisException. Interestingly, LEAD is not affected, so it can be worked around by negating the lag and using lead instead.

      Reproduction (run in spark-shell):

      import org.apache.spark.sql.expressions.Window
      val ds = Seq((1,1),(1,2),(1,3),(2,1),(2,2)).toDF("n", "i")
      // The following works:
      ds.withColumn("m", lead("i", -1).over(Window.partitionBy("n").orderBy("i").rowsBetween(-1, -1))).show
      // The following (equivalent) fails:
      ds.withColumn("m", lag("i", 1).over(Window.partitionBy("n").orderBy("i").rowsBetween(-1, -1))).show
      

      Here is the stacktrace:

      org.apache.spark.sql.AnalysisException: Window Frame specifiedwindowframe(RowFrame, -1, -1) must match the required frame specifiedwindowframe(RowFrame, -1, -1);
      at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$class.failAnalysis(CheckAnalysis.scala:41)
      at org.apache.spark.sql.catalyst.analysis.Analyzer.failAnalysis(Analyzer.scala:91)
      at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveWindowFrame$$anonfun$apply$31$$anonfun$applyOrElse$10.applyOrElse(Analyzer.scala:2034)
      at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveWindowFrame$$anonfun$apply$31$$anonfun$applyOrElse$10.applyOrElse(Analyzer.scala:2030)
      at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$2.apply(TreeNode.scala:267)
      at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$2.apply(TreeNode.scala:267)
      at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:70)
      at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:266)
      at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformDown$1.apply(TreeNode.scala:272)
      at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformDown$1.apply(TreeNode.scala:272)
      at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$4.apply(TreeNode.scala:306)
      at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:187)
      at org.apache.spark.sql.catalyst.trees.TreeNode.mapChildren(TreeNode.scala:304)
      at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:272)
      at org.apache.spark.sql.catalyst.plans.QueryPlan$$anonfun$transformExpressionsDown$1.apply(QueryPlan.scala:85)
      at org.apache.spark.sql.catalyst.plans.QueryPlan$$anonfun$transformExpressionsDown$1.apply(QueryPlan.scala:85)
      at org.apache.spark.sql.catalyst.plans.QueryPlan.transformExpression$1(QueryPlan.scala:106)
      at org.apache.spark.sql.catalyst.plans.QueryPlan.org$apache$spark$sql$catalyst$plans$QueryPlan$$recursiveTransform$1(QueryPlan.scala:116)
      at org.apache.spark.sql.catalyst.plans.QueryPlan$$anonfun$org$apache$spark$sql$catalyst$plans$QueryPlan$$recursiveTransform$1$1.apply(QueryPlan.scala:120)
      at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
      at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
      at scala.collection.immutable.List.foreach(List.scala:381)
      at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
      at scala.collection.immutable.List.map(List.scala:285)
      at org.apache.spark.sql.catalyst.plans.QueryPlan.org$apache$spark$sql$catalyst$plans$QueryPlan$$recursiveTransform$1(QueryPlan.scala:120)
      at org.apache.spark.sql.catalyst.plans.QueryPlan$$anonfun$1.apply(QueryPlan.scala:125)
      at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:187)
      at org.apache.spark.sql.catalyst.plans.QueryPlan.mapExpressions(QueryPlan.scala:125)
      at org.apache.spark.sql.catalyst.plans.QueryPlan.transformExpressionsDown(QueryPlan.scala:85)
      at org.apache.spark.sql.catalyst.plans.QueryPlan.transformExpressions(QueryPlan.scala:76)
      at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveWindowFrame$$anonfun$apply$31.applyOrElse(Analyzer.scala:2030)
      at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveWindowFrame$$anonfun$apply$31.applyOrElse(Analyzer.scala:2029)
      at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$2.apply(TreeNode.scala:267)
      at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$2.apply(TreeNode.scala:267)
      at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:70)
      at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:266)
      at org.apache.spark.sql.catalyst.trees.TreeNode.transform(TreeNode.scala:256)
      at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveWindowFrame$.apply(Analyzer.scala:2029)
      at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveWindowFrame$.apply(Analyzer.scala:2028)
      at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1$$anonfun$apply$1.apply(RuleExecutor.scala:87)
      at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1$$anonfun$apply$1.apply(RuleExecutor.scala:84)
      at scala.collection.LinearSeqOptimized$class.foldLeft(LinearSeqOptimized.scala:124)
      at scala.collection.immutable.List.foldLeft(List.scala:84)
      at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1.apply(RuleExecutor.scala:84)
      at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1.apply(RuleExecutor.scala:76)
      at scala.collection.immutable.List.foreach(List.scala:381)
      at org.apache.spark.sql.catalyst.rules.RuleExecutor.execute(RuleExecutor.scala:76)
      at org.apache.spark.sql.catalyst.analysis.Analyzer.org$apache$spark$sql$catalyst$analysis$Analyzer$$executeSameContext(Analyzer.scala:123)
      at org.apache.spark.sql.catalyst.analysis.Analyzer.execute(Analyzer.scala:117)
      at org.apache.spark.sql.catalyst.analysis.Analyzer.executeAndCheck(Analyzer.scala:102)
      at org.apache.spark.sql.execution.QueryExecution.analyzed$lzycompute(QueryExecution.scala:57)
      at org.apache.spark.sql.execution.QueryExecution.analyzed(QueryExecution.scala:55)
      at org.apache.spark.sql.execution.QueryExecution.assertAnalyzed(QueryExecution.scala:47)
      at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:74)
      at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$withPlan(Dataset.scala:3295)
      at org.apache.spark.sql.Dataset.select(Dataset.scala:1307)
      at org.apache.spark.sql.Dataset.withColumns(Dataset.scala:2192)
      at org.apache.spark.sql.Dataset.withColumn(Dataset.scala:2159)
      ... 49 elided

        Attachments

          Activity

            People

            • Assignee:
              smilegator Xiao Li
              Reporter:
              emlyn Emlyn Corrin
            • Votes:
              0 Vote for this issue
              Watchers:
              3 Start watching this issue

              Dates

              • Created:
                Updated:
                Resolved: