Description
When I run a SQL query with a join, the task throws java.lang.OutOfMemoryError and the query fails. I have set spark.executor.memory to 4096m.
SortMergeJoin uses an ArrayBuffer[InternalRow] to store bufferedMatches. If many rows share the same join key, this buffer grows without bound and eventually throws OutOfMemoryError.
/** Buffered rows from the buffered side of the join. This is empty if there are no matches. */
private[this] val bufferedMatches: ArrayBuffer[InternalRow] = new ArrayBuffer[InternalRow]
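For context, here is a minimal sketch of a query shape that can trigger this (hypothetical object, table, and column names, written against the 1.6-era SQLContext API): a single hot key on the buffered side forces bufferedMatches to materialize every matching row at once.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

object SkewedJoinRepro {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("SkewedJoinRepro"))
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._

    // Hypothetical data: the buffered (right) side has 10 million rows
    // that all share join key 1, so bufferedMatches must hold all of
    // them in memory before any joined row can be emitted.
    val left = sc.parallelize(Seq((1, "a"))).toDF("key", "lv")
    val right = sc.parallelize(1 to 10000000).map(i => (1, i)).toDF("key", "rv")

    // Disable broadcast joins so the planner chooses SortMergeJoin.
    sqlContext.setConf("spark.sql.autoBroadcastJoinThreshold", "-1")

    // Expected to fail in bufferMatchingRows on a small heap.
    left.join(right, "key").count()
  }
}

Raising spark.executor.memory only delays the failure, since the matches for a single key are held entirely in the in-memory ArrayBuffer and are never spilled.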
Here is the stack trace:
org.xerial.snappy.SnappyNative.arrayCopy(Native Method)
org.xerial.snappy.Snappy.arrayCopy(Snappy.java:84)
org.xerial.snappy.SnappyInputStream.rawRead(SnappyInputStream.java:190)
org.xerial.snappy.SnappyInputStream.read(SnappyInputStream.java:163)
java.io.DataInputStream.readFully(DataInputStream.java:195)
java.io.DataInputStream.readLong(DataInputStream.java:416)
org.apache.spark.util.collection.unsafe.sort.UnsafeSorterSpillReader.loadNext(UnsafeSorterSpillReader.java:71)
org.apache.spark.util.collection.unsafe.sort.UnsafeSorterSpillMerger$2.loadNext(UnsafeSorterSpillMerger.java:79)
org.apache.spark.sql.execution.UnsafeExternalRowSorter$1.next(UnsafeExternalRowSorter.java:136)
org.apache.spark.sql.execution.UnsafeExternalRowSorter$1.next(UnsafeExternalRowSorter.java:123)
org.apache.spark.sql.execution.RowIteratorFromScala.advanceNext(RowIterator.scala:84)
org.apache.spark.sql.execution.joins.SortMergeJoinScanner.advancedBufferedToRowWithNullFreeJoinKey(SortMergeJoin.scala:300)
org.apache.spark.sql.execution.joins.SortMergeJoinScanner.bufferMatchingRows(SortMergeJoin.scala:329)
org.apache.spark.sql.execution.joins.SortMergeJoinScanner.findNextInnerJoinRows(SortMergeJoin.scala:229)
org.apache.spark.sql.execution.joins.SortMergeJoin$$anonfun$doExecute$1$$anon$1.advanceNext(SortMergeJoin.scala:105)
org.apache.spark.sql.execution.RowIteratorToScala.hasNext(RowIterator.scala:68)
scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:327)
org.apache.spark.sql.execution.aggregate.TungstenAggregate$$anonfun$doExecute$1$$anonfun$2.apply(TungstenAggregate.scala:88)
org.apache.spark.sql.execution.aggregate.TungstenAggregate$$anonfun$doExecute$1$$anonfun$2.apply(TungstenAggregate.scala:86)
org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$20.apply(RDD.scala:741)
org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$20.apply(RDD.scala:741)
org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:337)
org.apache.spark.rdd.RDD.iterator(RDD.scala:301)
org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:337)
org.apache.spark.rdd.RDD.iterator(RDD.scala:301)
org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:73)
org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41)
org.apache.spark.scheduler.Task.run(Task.scala:89)
org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:215)
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
java.lang.Thread.run(Thread.java:744)