In [20]: df1 = spark.range(10).withColumn("a", sf.lit(0))
In [21]: df2 = spark.range(10).withColumn("a", sf.lit(0)).withColumnRenamed("a", "b")
In [22]: df1.join(df2, df1["a"] == df2["b"])
Out[22]: DataFrame[id: bigint, a: int, id: bigint, b: int]
In [23]: df1.join(df2, df1["a"] == df2["b"]).drop("b")
Out[23]: DataFrame[id: bigint, a: int, id: bigint]
In [24]: df1 = spark.range(10).withColumn("a", sf.lit(0))
In [25]: df2 = df1.withColumnRenamed("a", "b")
In [26]: df1.join(df2, df1["a"] == df2["b"])
Out[26]: 23/12/22 09:33:28 ERROR ErrorUtils: Spark Connect RPC error during: analyze. UserId: ruifeng.zheng. SessionId: eaa2161f-4b64-4dbf-9809-af6b696d3005.
org.apache.spark.sql.AnalysisException: [AMBIGUOUS_COLUMN_REFERENCE] Column a is ambiguous. It's because you joined several DataFrame together, and some of these DataFrames are the same.
This column points to one of the DataFrame but Spark is unable to figure out which one.
Please alias the DataFrames with different names via DataFrame.alias before joining them,
and specify the column using qualified name, e.g. df.alias("a").join(df.alias("b"), col("a.id") > col("b.id")). SQLSTATE: 42702
at org.apache.spark.sql.catalyst.analysis.ColumnResolutionHelper.findPlanById(ColumnResolutionHelper.scala:555)
at org.apache.spark.sql.catalyst.analysis.ColumnResolutionHelper.$anonfun$resolveUnresolvedAttributeByPlanId$2(ColumnResolutionHelper.scala:511)
at scala.collection.mutable.HashMap.getOrElseUpdate(HashMap.scala:469)
at org.apache.spark.sql.catalyst.analysis.ColumnResolutionHelper.resolveUnresolvedAttributeByPlanId(ColumnResolutionHelper.scala:511)
at org.apache.spark.sql.catalyst.analysis.ColumnResolutionHelper.tryResolveColumnByPlanId(ColumnResolutionHelper.scala:494)
at org.apache.spark.sql.catalyst.analysis.ColumnResolutionHelper.$anonfun$tryResolveColumnByPlanId$2(ColumnResolutionHelper.scala:497)
at scala.collection.immutable.List.map(List.scala:246)