Uploaded image for project: 'Spark'
  1. Spark
  2. SPARK-24033

LAG Window function broken in Spark 2.3

Attach filesAttach ScreenshotVotersWatch issueWatchersCreate sub-taskLinkCloneUpdate Comment AuthorReplace String in CommentUpdate Comment VisibilityDelete Comments
    XMLWordPrintableJSON

Details

    • Bug
    • Status: Resolved
    • Major
    • Resolution: Fixed
    • 2.3.0
    • 2.3.1, 2.4.0
    • SQL
    • None

    Description

      The LAG window function appears to be broken in Spark 2.3.0, always failing with an AnalysisException. Interestingly, LEAD is not affected, so it can be worked around by negating the lag and using lead instead.

      Reproduction (run in spark-shell):

      import org.apache.spark.sql.expressions.Window
      val ds = Seq((1,1),(1,2),(1,3),(2,1),(2,2)).toDF("n", "i")
      // The following works:
      ds.withColumn("m", lead("i", -1).over(Window.partitionBy("n").orderBy("i").rowsBetween(-1, -1))).show
      // The following (equivalent) fails:
      ds.withColumn("m", lag("i", 1).over(Window.partitionBy("n").orderBy("i").rowsBetween(-1, -1))).show
      

      Here is the stacktrace:

      org.apache.spark.sql.AnalysisException: Window Frame specifiedwindowframe(RowFrame, -1, -1) must match the required frame specifiedwindowframe(RowFrame, -1, -1);
      at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$class.failAnalysis(CheckAnalysis.scala:41)
      at org.apache.spark.sql.catalyst.analysis.Analyzer.failAnalysis(Analyzer.scala:91)
      at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveWindowFrame$$anonfun$apply$31$$anonfun$applyOrElse$10.applyOrElse(Analyzer.scala:2034)
      at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveWindowFrame$$anonfun$apply$31$$anonfun$applyOrElse$10.applyOrElse(Analyzer.scala:2030)
      at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$2.apply(TreeNode.scala:267)
      at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$2.apply(TreeNode.scala:267)
      at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:70)
      at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:266)
      at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformDown$1.apply(TreeNode.scala:272)
      at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformDown$1.apply(TreeNode.scala:272)
      at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$4.apply(TreeNode.scala:306)
      at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:187)
      at org.apache.spark.sql.catalyst.trees.TreeNode.mapChildren(TreeNode.scala:304)
      at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:272)
      at org.apache.spark.sql.catalyst.plans.QueryPlan$$anonfun$transformExpressionsDown$1.apply(QueryPlan.scala:85)
      at org.apache.spark.sql.catalyst.plans.QueryPlan$$anonfun$transformExpressionsDown$1.apply(QueryPlan.scala:85)
      at org.apache.spark.sql.catalyst.plans.QueryPlan.transformExpression$1(QueryPlan.scala:106)
      at org.apache.spark.sql.catalyst.plans.QueryPlan.org$apache$spark$sql$catalyst$plans$QueryPlan$$recursiveTransform$1(QueryPlan.scala:116)
      at org.apache.spark.sql.catalyst.plans.QueryPlan$$anonfun$org$apache$spark$sql$catalyst$plans$QueryPlan$$recursiveTransform$1$1.apply(QueryPlan.scala:120)
      at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
      at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
      at scala.collection.immutable.List.foreach(List.scala:381)
      at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
      at scala.collection.immutable.List.map(List.scala:285)
      at org.apache.spark.sql.catalyst.plans.QueryPlan.org$apache$spark$sql$catalyst$plans$QueryPlan$$recursiveTransform$1(QueryPlan.scala:120)
      at org.apache.spark.sql.catalyst.plans.QueryPlan$$anonfun$1.apply(QueryPlan.scala:125)
      at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:187)
      at org.apache.spark.sql.catalyst.plans.QueryPlan.mapExpressions(QueryPlan.scala:125)
      at org.apache.spark.sql.catalyst.plans.QueryPlan.transformExpressionsDown(QueryPlan.scala:85)
      at org.apache.spark.sql.catalyst.plans.QueryPlan.transformExpressions(QueryPlan.scala:76)
      at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveWindowFrame$$anonfun$apply$31.applyOrElse(Analyzer.scala:2030)
      at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveWindowFrame$$anonfun$apply$31.applyOrElse(Analyzer.scala:2029)
      at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$2.apply(TreeNode.scala:267)
      at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$2.apply(TreeNode.scala:267)
      at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:70)
      at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:266)
      at org.apache.spark.sql.catalyst.trees.TreeNode.transform(TreeNode.scala:256)
      at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveWindowFrame$.apply(Analyzer.scala:2029)
      at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveWindowFrame$.apply(Analyzer.scala:2028)
      at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1$$anonfun$apply$1.apply(RuleExecutor.scala:87)
      at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1$$anonfun$apply$1.apply(RuleExecutor.scala:84)
      at scala.collection.LinearSeqOptimized$class.foldLeft(LinearSeqOptimized.scala:124)
      at scala.collection.immutable.List.foldLeft(List.scala:84)
      at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1.apply(RuleExecutor.scala:84)
      at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1.apply(RuleExecutor.scala:76)
      at scala.collection.immutable.List.foreach(List.scala:381)
      at org.apache.spark.sql.catalyst.rules.RuleExecutor.execute(RuleExecutor.scala:76)
      at org.apache.spark.sql.catalyst.analysis.Analyzer.org$apache$spark$sql$catalyst$analysis$Analyzer$$executeSameContext(Analyzer.scala:123)
      at org.apache.spark.sql.catalyst.analysis.Analyzer.execute(Analyzer.scala:117)
      at org.apache.spark.sql.catalyst.analysis.Analyzer.executeAndCheck(Analyzer.scala:102)
      at org.apache.spark.sql.execution.QueryExecution.analyzed$lzycompute(QueryExecution.scala:57)
      at org.apache.spark.sql.execution.QueryExecution.analyzed(QueryExecution.scala:55)
      at org.apache.spark.sql.execution.QueryExecution.assertAnalyzed(QueryExecution.scala:47)
      at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:74)
      at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$withPlan(Dataset.scala:3295)
      at org.apache.spark.sql.Dataset.select(Dataset.scala:1307)
      at org.apache.spark.sql.Dataset.withColumns(Dataset.scala:2192)
      at org.apache.spark.sql.Dataset.withColumn(Dataset.scala:2159)
      ... 49 elided

      Attachments

        Activity

          This comment will be Viewable by All Users Viewable by All Users
          Cancel

          People

            smilegator Xiao Li
            emlyn Emlyn Corrin
            Votes:
            0 Vote for this issue
            Watchers:
            3 Start watching this issue

            Dates

              Created:
              Updated:
              Resolved:

              Slack

                Issue deployment