Details
-
Question
-
Status: Resolved
-
Minor
-
Resolution: Invalid
-
2.4.4
-
None
-
None
Description
It's took about 2mins for one 248 MB file. 2 files ~ 5 mins How can I tune or maximize the performance.
Initialize spark as below:
{{.setMaster(numCores)
.set("spark.driver.host", "localhost")
.set("spark.executor.cores","2")
.set("spark.num.executors","2")
.set("spark.executor.memory", "4g")
.set("spark.dynamicAllocation.enabled", "true")
.set("spark.dynamicAllocation.minExecutors","2")
.set("spark.dynamicAllocation.maxExecutors","2")
.set("spark.ui.enabled","true")
.set("spark.sql.shuffle.partitions",defaultPartitions)}}
{{}}
{{joinedDf = upperCaseLeft.as("l")
.join(upperCaseRight.as("r"), caseTransformedKeys, "full_outer")
.select(compositeKeysCol ::: nonKeyCols.map(col => mapHelper(col,toleranceValue,caseSensitive)): _*)}}
{{}}
{{}}
{{}}
data = joinedDf.take(1000)
{{}}
https://i.stack.imgur.com/oeYww.png{{}}
== Parsed Logical Plan ==
GlobalLimit 5
+- LocalLimit 5
+- Project [COL1#155, CASE WHEN (isnull(COL2#98) && isnull(COL2#114)) THEN [null] WHEN isnull(COL2#98) THEN concat([null]<==>, COL2#114) WHEN isnull(COL2#114) THEN concat(COL2#98, <==>[null]) WHEN ((upper(COL2#98) = upper(COL2#114)) && true) THEN concat(, COL2#98) WHEN (abs((cast(COL2#98 as double) - cast(COL2#114 as double))) <= 0.1) THEN concat(COL2#98, <OK>, COL2#114) ELSE concat(COL2#98, <==>, COL2#114) END AS COL2#171, CASE WHEN (isnull(COL3#99) && isnull(COL3#115)) THEN [null] WHEN isnull(COL3#99) THEN concat([null]<==>, COL3#115) WHEN isnull(COL3#115) THEN concat(COL3#99, <==>[null]) WHEN ((upper(COL3#99) = upper(COL3#115)) && true) THEN concat(, COL3#99) WHEN (abs((cast(COL3#99 as double) - cast(COL3#115 as double))) <= 0.1) THEN concat(COL3#99, <OK>, COL3#115) ELSE concat(COL3#99, <==>, COL3#115) END AS COL3#172, CASE WHEN (isnull(COL4#100) && isnull(COL4#116)) THEN [null] WHEN isnull(COL4#100) THEN concat([null]<==>, COL4#116) WHEN isnull(COL4#116) THEN concat(COL4#100, <==>[null]) WHEN ((upper(COL4#100) = upper(COL4#116)) && true) THEN concat(, COL4#100) WHEN (abs((cast(COL4#100 as double) - cast(COL4#116 as double))) <= 0.1) THEN concat(COL4#100, <OK>, COL4#116) ELSE concat(COL4#100, <==>, COL4#116) END AS COL4#173, CASE WHEN (isnull(COL5#101) && isnull(COL5#117)) THEN [null] WHEN isnull(COL5#101) THEN concat([null]<==>, COL5#117) WHEN isnull(COL5#117) THEN concat(COL5#101, <==>[null]) WHEN ((upper(COL5#101) = upper(COL5#117)) && true) THEN concat(, COL5#101) WHEN (abs((cast(COL5#101 as double) - cast(COL5#117 as double))) <= 0.1) THEN concat(COL5#101, <OK>, COL5#117) ELSE concat(COL5#101, <==>, COL5#117) END AS COL5#174, CASE WHEN (isnull(COL6#102) && isnull(COL6#118)) THEN [null] WHEN isnull(COL6#102) THEN concat([null]<==>, COL6#118) WHEN isnull(COL6#118) THEN concat(COL6#102, <==>[null]) WHEN ((upper(COL6#102) = upper(COL6#118)) && true) THEN concat(, COL6#102) WHEN (abs((cast(COL6#102 as double) - cast(COL6#118 as double))) <= 0.1) THEN concat(COL6#102, <OK>, COL6#118) ELSE concat(COL6#102, <==>, COL6#118) END AS COL6#175, CASE WHEN (isnull(COL7#103) && isnull(COL7#119)) THEN [null] WHEN isnull(COL7#103) THEN concat([null]<==>, COL7#119) WHEN isnull(COL7#119) THEN concat(COL7#103, <==>[null]) WHEN ((upper(COL7#103) = upper(COL7#119)) && true) THEN concat(, COL7#103) WHEN (abs((cast(COL7#103 as double) - cast(COL7#119 as double))) <= 0.1) THEN concat(COL7#103, <OK>, COL7#119) ELSE concat(COL7#103, <==>, COL7#119) END AS COL7#176, CASE WHEN (isnull(COL8#104) && isnull(COL8#120)) THEN [null] WHEN isnull(COL8#104) THEN concat([null]<==>, COL8#120) WHEN isnull(COL8#120) THEN concat(COL8#104, <==>[null]) WHEN ((upper(COL8#104) = upper(COL8#120)) && true) THEN concat(, COL8#104) WHEN (abs((cast(COL8#104 as double) - cast(COL8#120 as double))) <= 0.1) THEN concat(COL8#104, <OK>, COL8#120) ELSE concat(COL8#104, <==>, COL8#120) END AS COL8#177]
+- Project coalesce(COL1#97, COL1#113) AS COL1#155, COL2#98, COL3#99, COL4#100, COL5#101, COL6#102, COL7#103, COL8#104, COL2#114, COL3#115, COL4#116, COL5#117, COL6#118, COL7#119, COL8#120
+- Join FullOuter, (COL1#97 = COL1#113)
:- SubqueryAlias `l`
: +- ResolvedHint (broadcast)
: +- Project col1#10 AS COL1#97, col2#11 AS COL2#98, col3#12 AS COL3#99, col4#13 AS COL4#100, col5#14 AS COL5#101, col6#15 AS COL6#102, col7#16 AS COL7#103, col8#17 AS COL8#104
: +- Project col1#10, col2#11, col3#12, col4#13, col5#14, col6#15, col7#16, col8#17
: +- Relationcol1#10,col2#11,col3#12,col4#13,col5#14,col6#15,col7#16,col8#17 csv
+- SubqueryAlias `r`
+- ResolvedHint (broadcast)
+- Project col1#36 AS COL1#113, col2#37 AS COL2#114, col3#38 AS COL3#115, col4#39 AS COL4#116, col5#40 AS COL5#117, col6#41 AS COL6#118, col7#42 AS COL7#119, col8#43 AS COL8#120
+- Project col1#36, col2#37, col3#38, col4#39, col5#40, col6#41, col7#42, col8#43
+- Relationcol1#36,col2#37,col3#38,col4#39,col5#40,col6#41,col7#42,col8#43 csv
== Analyzed Logical Plan ==
COL1: string, COL2: string, COL3: string, COL4: string, COL5: string, COL6: string, COL7: string, COL8: string
GlobalLimit 5
+- LocalLimit 5
+- Project [COL1#155, CASE WHEN (isnull(COL2#98) && isnull(COL2#114)) THEN [null] WHEN isnull(COL2#98) THEN concat([null]<==>, COL2#114) WHEN isnull(COL2#114) THEN concat(COL2#98, <==>[null]) WHEN ((upper(COL2#98) = upper(COL2#114)) && true) THEN concat(, COL2#98) WHEN (abs((cast(COL2#98 as double) - cast(COL2#114 as double))) <= 0.1) THEN concat(COL2#98, <OK>, COL2#114) ELSE concat(COL2#98, <==>, COL2#114) END AS COL2#171, CASE WHEN (isnull(COL3#99) && isnull(COL3#115)) THEN [null] WHEN isnull(COL3#99) THEN concat([null]<==>, COL3#115) WHEN isnull(COL3#115) THEN concat(COL3#99, <==>[null]) WHEN ((upper(COL3#99) = upper(COL3#115)) && true) THEN concat(, COL3#99) WHEN (abs((cast(COL3#99 as double) - cast(COL3#115 as double))) <= 0.1) THEN concat(COL3#99, <OK>, COL3#115) ELSE concat(COL3#99, <==>, COL3#115) END AS COL3#172, CASE WHEN (isnull(COL4#100) && isnull(COL4#116)) THEN [null] WHEN isnull(COL4#100) THEN concat([null]<==>, COL4#116) WHEN isnull(COL4#116) THEN concat(COL4#100, <==>[null]) WHEN ((upper(COL4#100) = upper(COL4#116)) && true) THEN concat(, COL4#100) WHEN (abs((cast(COL4#100 as double) - cast(COL4#116 as double))) <= 0.1) THEN concat(COL4#100, <OK>, COL4#116) ELSE concat(COL4#100, <==>, COL4#116) END AS COL4#173, CASE WHEN (isnull(COL5#101) && isnull(COL5#117)) THEN [null] WHEN isnull(COL5#101) THEN concat([null]<==>, COL5#117) WHEN isnull(COL5#117) THEN concat(COL5#101, <==>[null]) WHEN ((upper(COL5#101) = upper(COL5#117)) && true) THEN concat(, COL5#101) WHEN (abs((cast(COL5#101 as double) - cast(COL5#117 as double))) <= 0.1) THEN concat(COL5#101, <OK>, COL5#117) ELSE concat(COL5#101, <==>, COL5#117) END AS COL5#174, CASE WHEN (isnull(COL6#102) && isnull(COL6#118)) THEN [null] WHEN isnull(COL6#102) THEN concat([null]<==>, COL6#118) WHEN isnull(COL6#118) THEN concat(COL6#102, <==>[null]) WHEN ((upper(COL6#102) = upper(COL6#118)) && true) THEN concat(, COL6#102) WHEN (abs((cast(COL6#102 as double) - cast(COL6#118 as double))) <= 0.1) THEN concat(COL6#102, <OK>, COL6#118) ELSE concat(COL6#102, <==>, COL6#118) END AS COL6#175, CASE WHEN (isnull(COL7#103) && isnull(COL7#119)) THEN [null] WHEN isnull(COL7#103) THEN concat([null]<==>, COL7#119) WHEN isnull(COL7#119) THEN concat(COL7#103, <==>[null]) WHEN ((upper(COL7#103) = upper(COL7#119)) && true) THEN concat(, COL7#103) WHEN (abs((cast(COL7#103 as double) - cast(COL7#119 as double))) <= 0.1) THEN concat(COL7#103, <OK>, COL7#119) ELSE concat(COL7#103, <==>, COL7#119) END AS COL7#176, CASE WHEN (isnull(COL8#104) && isnull(COL8#120)) THEN [null] WHEN isnull(COL8#104) THEN concat([null]<==>, COL8#120) WHEN isnull(COL8#120) THEN concat(COL8#104, <==>[null]) WHEN ((upper(COL8#104) = upper(COL8#120)) && true) THEN concat(, COL8#104) WHEN (abs((cast(COL8#104 as double) - cast(COL8#120 as double))) <= 0.1) THEN concat(COL8#104, <OK>, COL8#120) ELSE concat(COL8#104, <==>, COL8#120) END AS COL8#177]
+- Project coalesce(COL1#97, COL1#113) AS COL1#155, COL2#98, COL3#99, COL4#100, COL5#101, COL6#102, COL7#103, COL8#104, COL2#114, COL3#115, COL4#116, COL5#117, COL6#118, COL7#119, COL8#120
+- Join FullOuter, (COL1#97 = COL1#113)
:- SubqueryAlias `l`
: +- ResolvedHint (broadcast)
: +- Project col1#10 AS COL1#97, col2#11 AS COL2#98, col3#12 AS COL3#99, col4#13 AS COL4#100, col5#14 AS COL5#101, col6#15 AS COL6#102, col7#16 AS COL7#103, col8#17 AS COL8#104
: +- Project col1#10, col2#11, col3#12, col4#13, col5#14, col6#15, col7#16, col8#17
: +- Relationcol1#10,col2#11,col3#12,col4#13,col5#14,col6#15,col7#16,col8#17 csv
+- SubqueryAlias `r`
+- ResolvedHint (broadcast)
+- Project col1#36 AS COL1#113, col2#37 AS COL2#114, col3#38 AS COL3#115, col4#39 AS COL4#116, col5#40 AS COL5#117, col6#41 AS COL6#118, col7#42 AS COL7#119, col8#43 AS COL8#120
+- Project col1#36, col2#37, col3#38, col4#39, col5#40, col6#41, col7#42, col8#43
+- Relationcol1#36,col2#37,col3#38,col4#39,col5#40,col6#41,col7#42,col8#43 csv
== Optimized Logical Plan ==
GlobalLimit 5
+- LocalLimit 5
+- Project [coalesce(COL1#97, COL1#113) AS COL1#155, CASE WHEN (isnull(COL2#98) && isnull(COL2#114)) THEN [null] WHEN isnull(COL2#98) THEN concat([null]<==>, COL2#114) WHEN isnull(COL2#114) THEN concat(COL2#98, <==>[null]) WHEN (upper(COL2#98) = upper(COL2#114)) THEN concat(, COL2#98) WHEN (abs((cast(COL2#98 as double) - cast(COL2#114 as double))) <= 0.1) THEN concat(COL2#98, <OK>, COL2#114) ELSE concat(COL2#98, <==>, COL2#114) END AS COL2#171, CASE WHEN (isnull(COL3#99) && isnull(COL3#115)) THEN [null] WHEN isnull(COL3#99) THEN concat([null]<==>, COL3#115) WHEN isnull(COL3#115) THEN concat(COL3#99, <==>[null]) WHEN (upper(COL3#99) = upper(COL3#115)) THEN concat(, COL3#99) WHEN (abs((cast(COL3#99 as double) - cast(COL3#115 as double))) <= 0.1) THEN concat(COL3#99, <OK>, COL3#115) ELSE concat(COL3#99, <==>, COL3#115) END AS COL3#172, CASE WHEN (isnull(COL4#100) && isnull(COL4#116)) THEN [null] WHEN isnull(COL4#100) THEN concat([null]<==>, COL4#116) WHEN isnull(COL4#116) THEN concat(COL4#100, <==>[null]) WHEN (upper(COL4#100) = upper(COL4#116)) THEN concat(, COL4#100) WHEN (abs((cast(COL4#100 as double) - cast(COL4#116 as double))) <= 0.1) THEN concat(COL4#100, <OK>, COL4#116) ELSE concat(COL4#100, <==>, COL4#116) END AS COL4#173, CASE WHEN (isnull(COL5#101) && isnull(COL5#117)) THEN [null] WHEN isnull(COL5#101) THEN concat([null]<==>, COL5#117) WHEN isnull(COL5#117) THEN concat(COL5#101, <==>[null]) WHEN (upper(COL5#101) = upper(COL5#117)) THEN concat(, COL5#101) WHEN (abs((cast(COL5#101 as double) - cast(COL5#117 as double))) <= 0.1) THEN concat(COL5#101, <OK>, COL5#117) ELSE concat(COL5#101, <==>, COL5#117) END AS COL5#174, CASE WHEN (isnull(COL6#102) && isnull(COL6#118)) THEN [null] WHEN isnull(COL6#102) THEN concat([null]<==>, COL6#118) WHEN isnull(COL6#118) THEN concat(COL6#102, <==>[null]) WHEN (upper(COL6#102) = upper(COL6#118)) THEN concat(, COL6#102) WHEN (abs((cast(COL6#102 as double) - cast(COL6#118 as double))) <= 0.1) THEN concat(COL6#102, <OK>, COL6#118) ELSE concat(COL6#102, <==>, COL6#118) END AS COL6#175, CASE WHEN (isnull(COL7#103) && isnull(COL7#119)) THEN [null] WHEN isnull(COL7#103) THEN concat([null]<==>, COL7#119) WHEN isnull(COL7#119) THEN concat(COL7#103, <==>[null]) WHEN (upper(COL7#103) = upper(COL7#119)) THEN concat(, COL7#103) WHEN (abs((cast(COL7#103 as double) - cast(COL7#119 as double))) <= 0.1) THEN concat(COL7#103, <OK>, COL7#119) ELSE concat(COL7#103, <==>, COL7#119) END AS COL7#176, CASE WHEN (isnull(COL8#104) && isnull(COL8#120)) THEN [null] WHEN isnull(COL8#104) THEN concat([null]<==>, COL8#120) WHEN isnull(COL8#120) THEN concat(COL8#104, <==>[null]) WHEN (upper(COL8#104) = upper(COL8#120)) THEN concat(, COL8#104) WHEN (abs((cast(COL8#104 as double) - cast(COL8#120 as double))) <= 0.1) THEN concat(COL8#104, <OK>, COL8#120) ELSE concat(COL8#104, <==>, COL8#120) END AS COL8#177]
+- Join FullOuter, (COL1#97 = COL1#113)
:- ResolvedHint (broadcast)
: +- Project col1#10 AS COL1#97, col2#11 AS COL2#98, col3#12 AS COL3#99, col4#13 AS COL4#100, col5#14 AS COL5#101, col6#15 AS COL6#102, col7#16 AS COL7#103, col8#17 AS COL8#104
: +- Relationcol1#10,col2#11,col3#12,col4#13,col5#14,col6#15,col7#16,col8#17 csv
+- ResolvedHint (broadcast)
+- Project col1#36 AS COL1#113, col2#37 AS COL2#114, col3#38 AS COL3#115, col4#39 AS COL4#116, col5#40 AS COL5#117, col6#41 AS COL6#118, col7#42 AS COL7#119, col8#43 AS COL8#120
+- Relationcol1#36,col2#37,col3#38,col4#39,col5#40,col6#41,col7#42,col8#43 csv
== Physical Plan ==
CollectLimit 5
+- *(5) Project [coalesce(COL1#97, COL1#113) AS COL1#155, CASE WHEN (isnull(COL2#98) && isnull(COL2#114)) THEN [null] WHEN isnull(COL2#98) THEN concat([null]<==>, COL2#114) WHEN isnull(COL2#114) THEN concat(COL2#98, <==>[null]) WHEN (upper(COL2#98) = upper(COL2#114)) THEN concat(, COL2#98) WHEN (abs((cast(COL2#98 as double) - cast(COL2#114 as double))) <= 0.1) THEN concat(COL2#98, <OK>, COL2#114) ELSE concat(COL2#98, <==>, COL2#114) END AS COL2#171, CASE WHEN (isnull(COL3#99) && isnull(COL3#115)) THEN [null] WHEN isnull(COL3#99) THEN concat([null]<==>, COL3#115) WHEN isnull(COL3#115) THEN concat(COL3#99, <==>[null]) WHEN (upper(COL3#99) = upper(COL3#115)) THEN concat(, COL3#99) WHEN (abs((cast(COL3#99 as double) - cast(COL3#115 as double))) <= 0.1) THEN concat(COL3#99, <OK>, COL3#115) ELSE concat(COL3#99, <==>, COL3#115) END AS COL3#172, CASE WHEN (isnull(COL4#100) && isnull(COL4#116)) THEN [null] WHEN isnull(COL4#100) THEN concat([null]<==>, COL4#116) WHEN isnull(COL4#116) THEN concat(COL4#100, <==>[null]) WHEN (upper(COL4#100) = upper(COL4#116)) THEN concat(, COL4#100) WHEN (abs((cast(COL4#100 as double) - cast(COL4#116 as double))) <= 0.1) THEN concat(COL4#100, <OK>, COL4#116) ELSE concat(COL4#100, <==>, COL4#116) END AS COL4#173, CASE WHEN (isnull(COL5#101) && isnull(COL5#117)) THEN [null] WHEN isnull(COL5#101) THEN concat([null]<==>, COL5#117) WHEN isnull(COL5#117) THEN concat(COL5#101, <==>[null]) WHEN (upper(COL5#101) = upper(COL5#117)) THEN concat(, COL5#101) WHEN (abs((cast(COL5#101 as double) - cast(COL5#117 as double))) <= 0.1) THEN concat(COL5#101, <OK>, COL5#117) ELSE concat(COL5#101, <==>, COL5#117) END AS COL5#174, CASE WHEN (isnull(COL6#102) && isnull(COL6#118)) THEN [null] WHEN isnull(COL6#102) THEN concat([null]<==>, COL6#118) WHEN isnull(COL6#118) THEN concat(COL6#102, <==>[null]) WHEN (upper(COL6#102) = upper(COL6#118)) THEN concat(, COL6#102) WHEN (abs((cast(COL6#102 as double) - cast(COL6#118 as double))) <= 0.1) THEN concat(COL6#102, <OK>, COL6#118) ELSE concat(COL6#102, <==>, COL6#118) END AS COL6#175, CASE WHEN (isnull(COL7#103) && isnull(COL7#119)) THEN [null] WHEN isnull(COL7#103) THEN concat([null]<==>, COL7#119) WHEN isnull(COL7#119) THEN concat(COL7#103, <==>[null]) WHEN (upper(COL7#103) = upper(COL7#119)) THEN concat(, COL7#103) WHEN (abs((cast(COL7#103 as double) - cast(COL7#119 as double))) <= 0.1) THEN concat(COL7#103, <OK>, COL7#119) ELSE concat(COL7#103, <==>, COL7#119) END AS COL7#176, CASE WHEN (isnull(COL8#104) && isnull(COL8#120)) THEN [null] WHEN isnull(COL8#104) THEN concat([null]<==>, COL8#120) WHEN isnull(COL8#120) THEN concat(COL8#104, <==>[null]) WHEN (upper(COL8#104) = upper(COL8#120)) THEN concat(, COL8#104) WHEN (abs((cast(COL8#104 as double) - cast(COL8#120 as double))) <= 0.1) THEN concat(COL8#104, <OK>, COL8#120) ELSE concat(COL8#104, <==>, COL8#120) END AS COL8#177]
+- SortMergeJoin COL1#97, COL1#113, FullOuter
:- *(2) Sort COL1#97 ASC NULLS FIRST, false, 0
: +- Exchange hashpartitioning(COL1#97, 100)
: +- *(1) Project col1#10 AS COL1#97, col2#11 AS COL2#98, col3#12 AS COL3#99, col4#13 AS COL4#100, col5#14 AS COL5#101, col6#15 AS COL6#102, col7#16 AS COL7#103, col8#17 AS COL8#104
: +- *(1) FileScan csv col1#10,col2#11,col3#12,col4#13,col5#14,col6#15,col7#16,col8#17 Batched: false, Format: CSV, Location: InMemoryFileIndexfile:/C:/tmp/base.txt, PartitionFilters: [], PushedFilters: [], ReadSchema: struct<col1:string,col2:string,col3:string,col4:string,col5:string,col6:string,col7:string,col8:s...
+- *(4) Sort COL1#113 ASC NULLS FIRST, false, 0
+- Exchange hashpartitioning(COL1#113, 100)
+- *(3) Project col1#36 AS COL1#113, col2#37 AS COL2#114, col3#38 AS COL3#115, col4#39 AS COL4#116, col5#40 AS COL5#117, col6#41 AS COL6#118, col7#42 AS COL7#119, col8#43 AS COL8#120
+- *(3) FileScan csv col1#36,col2#37,col3#38,col4#39,col5#40,col6#41,col7#42,col8#43 Batched: false, Format: CSV, Location: InMemoryFileIndexfile:/C:/tmp/test.txt, PartitionFilters: [], PushedFilters: [], ReadSchema: struct<col1:string,col2:string,col3:string,col4:string,col5:string,col6:string,col7:string,col8:s...