  Project: Parquet
  Parent: PARQUET-1739 Make Spark SQL support Column indexes
  Issue: PARQUET-1744

Some filters throw ArrayIndexOutOfBoundsException



    Description

      How to reproduce:

      • Build Spark (using the branch from apache/spark PR 26804):
        git clone https://github.com/apache/spark.git && cd spark
        git fetch origin pull/26804/head:PARQUET-1744
        git checkout PARQUET-1744
        build/sbt package
        bin/spark-shell
        
      • Prepare data:
        spark.sql("create table t1(a int, b int, c int) using parquet")
        spark.sql("insert into t1 values(1,0,0)")
        spark.sql("insert into t1 values(2,0,1)")
        spark.sql("insert into t1 values(3,1,0)")
        spark.sql("insert into t1 values(4,1,1)")
        spark.sql("insert into t1 values(5,null,0)")
        spark.sql("insert into t1 values(6,null,1)")
        spark.sql("insert into t1 values(7,null,null)")
        
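      Because each INSERT statement writes a separate Parquet file, the files for rows 5 to 7 contain pages in which column b (and, for row 7, column c) is entirely null; these all-null pages are what the column-index filter mishandles in the tests below. As an optional sanity check (not part of the original report), Spark's built-in input_file_name() function shows the one-file-per-row layout:
        spark.sql("select input_file_name() as file, a, b, c from t1").show(false)
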
      • Run test 1:
        scala> spark.sql("select a+120 from t1 where b<10 OR c=1").show
        java.lang.reflect.InvocationTargetException
        	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
        	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
        	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
        	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
        	at org.apache.spark.sql.execution.datasources.parquet.SpecificParquetRecordReaderBase.initialize(SpecificParquetRecordReaderBase.java:155)
        	at org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.initialize(VectorizedParquetRecordReader.java:131)
        	at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat.$anonfun$buildReaderWithPartitionValues$2(ParquetFileFormat.scala:319)
        	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:116)
        	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:169)
        	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:93)
        	at org.apache.spark.sql.execution.FileSourceScanExec$$anon$1.hasNext(DataSourceScanExec.scala:486)
        	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.columnartorow_nextBatch_0$(Unknown Source)
        	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
        	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
        	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:726)
        	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:339)
        	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:872)
        	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:872)
        	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
        	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349)
        	at org.apache.spark.rdd.RDD.iterator(RDD.scala:313)
        	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
        	at org.apache.spark.scheduler.Task.run(Task.scala:127)
        	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:441)
        	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
        	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:444)
        	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
        	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
        	at java.base/java.lang.Thread.run(Thread.java:834)
        Caused by: java.lang.ArrayIndexOutOfBoundsException: Index -1 out of bounds for length 0
        	at org.apache.parquet.internal.column.columnindex.IntColumnIndexBuilder$IntColumnIndex$1.compareValueToMin(IntColumnIndexBuilder.java:74)
        	at org.apache.parquet.internal.column.columnindex.BoundaryOrder$2.lt(BoundaryOrder.java:123)
        	at org.apache.parquet.internal.column.columnindex.ColumnIndexBuilder$ColumnIndexBase.visit(ColumnIndexBuilder.java:262)
        	at org.apache.parquet.internal.column.columnindex.ColumnIndexBuilder$ColumnIndexBase.visit(ColumnIndexBuilder.java:64)
        	at org.apache.parquet.internal.filter2.columnindex.ColumnIndexFilter.lambda$visit$2(ColumnIndexFilter.java:131)
        	at org.apache.parquet.internal.filter2.columnindex.ColumnIndexFilter.applyPredicate(ColumnIndexFilter.java:176)
        	at org.apache.parquet.internal.filter2.columnindex.ColumnIndexFilter.visit(ColumnIndexFilter.java:131)
        	at org.apache.parquet.internal.filter2.columnindex.ColumnIndexFilter.visit(ColumnIndexFilter.java:56)
        	at org.apache.parquet.filter2.predicate.Operators$Lt.accept(Operators.java:209)
        	at org.apache.parquet.internal.filter2.columnindex.ColumnIndexFilter.visit(ColumnIndexFilter.java:186)
        	at org.apache.parquet.internal.filter2.columnindex.ColumnIndexFilter.visit(ColumnIndexFilter.java:56)
        	at org.apache.parquet.filter2.predicate.Operators$Or.accept(Operators.java:321)
        	at org.apache.parquet.internal.filter2.columnindex.ColumnIndexFilter$1.visit(ColumnIndexFilter.java:86)
        	at org.apache.parquet.internal.filter2.columnindex.ColumnIndexFilter$1.visit(ColumnIndexFilter.java:81)
        	at org.apache.parquet.filter2.compat.FilterCompat$FilterPredicateCompat.accept(FilterCompat.java:137)
        	at org.apache.parquet.internal.filter2.columnindex.ColumnIndexFilter.calculateRowRanges(ColumnIndexFilter.java:81)
        	at org.apache.parquet.hadoop.ParquetFileReader.getRowRanges(ParquetFileReader.java:961)
        	at org.apache.parquet.hadoop.ParquetFileReader.getFilteredRecordCount(ParquetFileReader.java:766)
        	... 29 more
        +---------+
        |(a + 120)|
        +---------+
        |      124|
        |      121|
        |      122|
        |      123|
        +---------+
        The result set is also incomplete: row (6, null, 1) satisfies c = 1, so 126 should be returned as well.
        
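      As a temporary workaround (an assumption on my part, not from the original report), column-index filtering can be disabled on the read path via parquet-hadoop's parquet.filter.columnindex.enabled switch, set here through the Hadoop configuration in spark-shell:
        // Disable column-index filtering so the reader falls back to
        // row-group level (statistics/dictionary) filtering only.
        spark.sparkContext.hadoopConfiguration
          .setBoolean("parquet.filter.columnindex.enabled", false)
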
      • Run test 2:
      scala> spark.sql("select a+140 from t1 where not (b<10 AND c=1)").show
      java.lang.reflect.InvocationTargetException
      	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
      	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
      	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
      	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
      	at org.apache.spark.sql.execution.datasources.parquet.SpecificParquetRecordReaderBase.initialize(SpecificParquetRecordReaderBase.java:155)
      	at org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.initialize(VectorizedParquetRecordReader.java:131)
      	at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat.$anonfun$buildReaderWithPartitionValues$2(ParquetFileFormat.scala:319)
      	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:116)
      	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:169)
      	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:93)
      	at org.apache.spark.sql.execution.FileSourceScanExec$$anon$1.hasNext(DataSourceScanExec.scala:486)
      	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.columnartorow_nextBatch_0$(Unknown Source)
      	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
      	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
      	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:726)
      	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:339)
      	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:872)
      	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:872)
      	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
      	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349)
      	at org.apache.spark.rdd.RDD.iterator(RDD.scala:313)
      	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
      	at org.apache.spark.scheduler.Task.run(Task.scala:127)
      	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:441)
      	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
      	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:444)
      	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
      	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
      	at java.base/java.lang.Thread.run(Thread.java:834)
      Caused by: java.lang.ArrayIndexOutOfBoundsException: Index 0 out of bounds for length 0
      	at org.apache.parquet.internal.column.columnindex.IntColumnIndexBuilder$IntColumnIndex$1.compareValueToMax(IntColumnIndexBuilder.java:79)
      	at org.apache.parquet.internal.column.columnindex.BoundaryOrder$2.gtEq(BoundaryOrder.java:107)
      	at org.apache.parquet.internal.column.columnindex.ColumnIndexBuilder$ColumnIndexBase.visit(ColumnIndexBuilder.java:257)
      	at org.apache.parquet.internal.column.columnindex.ColumnIndexBuilder$ColumnIndexBase.visit(ColumnIndexBuilder.java:64)
      	at org.apache.parquet.internal.filter2.columnindex.ColumnIndexFilter.lambda$visit$5(ColumnIndexFilter.java:146)
      	at org.apache.parquet.internal.filter2.columnindex.ColumnIndexFilter.applyPredicate(ColumnIndexFilter.java:176)
      	at org.apache.parquet.internal.filter2.columnindex.ColumnIndexFilter.visit(ColumnIndexFilter.java:146)
      	at org.apache.parquet.internal.filter2.columnindex.ColumnIndexFilter.visit(ColumnIndexFilter.java:56)
      	at org.apache.parquet.filter2.predicate.Operators$GtEq.accept(Operators.java:249)
      	at org.apache.parquet.internal.filter2.columnindex.ColumnIndexFilter.visit(ColumnIndexFilter.java:186)
      	at org.apache.parquet.internal.filter2.columnindex.ColumnIndexFilter.visit(ColumnIndexFilter.java:56)
      	at org.apache.parquet.filter2.predicate.Operators$Or.accept(Operators.java:321)
      	at org.apache.parquet.internal.filter2.columnindex.ColumnIndexFilter$1.visit(ColumnIndexFilter.java:86)
      	at org.apache.parquet.internal.filter2.columnindex.ColumnIndexFilter$1.visit(ColumnIndexFilter.java:81)
      	at org.apache.parquet.filter2.compat.FilterCompat$FilterPredicateCompat.accept(FilterCompat.java:137)
      	at org.apache.parquet.internal.filter2.columnindex.ColumnIndexFilter.calculateRowRanges(ColumnIndexFilter.java:81)
      	at org.apache.parquet.hadoop.ParquetFileReader.getRowRanges(ParquetFileReader.java:961)
      	at org.apache.parquet.hadoop.ParquetFileReader.getFilteredRecordCount(ParquetFileReader.java:766)
      	... 29 more
      (the executor prints the same stack trace a second time for a second task; the verbatim duplicate is omitted here)
      +---------+
      |(a + 140)|
      +---------+
      |      141|
      |      143|
      +---------+
      The result set is also incomplete: for row (5, null, 0), c = 1 is false, so the AND is false and the NOT is true; 145 should be returned as well.
      
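      Note that the traces for tests 2 and 3 go through Operators$Or and Operators$GtEq even though the queries use NOT(... AND ...): Spark applies De Morgan's law before pushdown, so the predicate reaching the column-index filter looks roughly like the sketch below (built with parquet-mr's FilterApi; the exact shape Spark produces is an assumption here):
        import org.apache.parquet.filter2.predicate.FilterApi._
        // not(b < 10 and c = 1)  ==>  b >= 10 or c != 1
        val pushed = or(
          gtEq(intColumn("b"), java.lang.Integer.valueOf(10)),
          notEq(intColumn("c"), java.lang.Integer.valueOf(1)))
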
      • Run test 3:
      scala> spark.sql("select a+150 from t1 where not (c=1 AND b<10)").show
      java.lang.reflect.InvocationTargetException
      	at jdk.internal.reflect.GeneratedMethodAccessor59.invoke(Unknown Source)
      	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
      	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
      	at org.apache.spark.sql.execution.datasources.parquet.SpecificParquetRecordReaderBase.initialize(SpecificParquetRecordReaderBase.java:155)
      	at org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.initialize(VectorizedParquetRecordReader.java:131)
      	at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat.$anonfun$buildReaderWithPartitionValues$2(ParquetFileFormat.scala:319)
      	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:116)
      	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:169)
      	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:93)
      	at org.apache.spark.sql.execution.FileSourceScanExec$$anon$1.hasNext(DataSourceScanExec.scala:486)
      	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.columnartorow_nextBatch_0$(Unknown Source)
      	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
      	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
      	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:726)
      	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:339)
      	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:872)
      	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:872)
      	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
      	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349)
      	at org.apache.spark.rdd.RDD.iterator(RDD.scala:313)
      	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
      	at org.apache.spark.scheduler.Task.run(Task.scala:127)
      	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:441)
      	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
      	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:444)
      	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
      	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
      	at java.base/java.lang.Thread.run(Thread.java:834)
      Caused by: java.lang.ArrayIndexOutOfBoundsException: Index 0 out of bounds for length 0
      	at org.apache.parquet.internal.column.columnindex.IntColumnIndexBuilder$IntColumnIndex$1.compareValueToMax(IntColumnIndexBuilder.java:79)
      	at org.apache.parquet.internal.column.columnindex.BoundaryOrder$2.gtEq(BoundaryOrder.java:107)
      	at org.apache.parquet.internal.column.columnindex.ColumnIndexBuilder$ColumnIndexBase.visit(ColumnIndexBuilder.java:257)
      	at org.apache.parquet.internal.column.columnindex.ColumnIndexBuilder$ColumnIndexBase.visit(ColumnIndexBuilder.java:64)
      	at org.apache.parquet.internal.filter2.columnindex.ColumnIndexFilter.lambda$visit$5(ColumnIndexFilter.java:146)
      	at org.apache.parquet.internal.filter2.columnindex.ColumnIndexFilter.applyPredicate(ColumnIndexFilter.java:176)
      	at org.apache.parquet.internal.filter2.columnindex.ColumnIndexFilter.visit(ColumnIndexFilter.java:146)
      	at org.apache.parquet.internal.filter2.columnindex.ColumnIndexFilter.visit(ColumnIndexFilter.java:56)
      	at org.apache.parquet.filter2.predicate.Operators$GtEq.accept(Operators.java:249)
      	at org.apache.parquet.internal.filter2.columnindex.ColumnIndexFilter.visit(ColumnIndexFilter.java:186)
      	at org.apache.parquet.internal.filter2.columnindex.ColumnIndexFilter.visit(ColumnIndexFilter.java:56)
      	at org.apache.parquet.filter2.predicate.Operators$Or.accept(Operators.java:321)
      	at org.apache.parquet.internal.filter2.columnindex.ColumnIndexFilter$1.visit(ColumnIndexFilter.java:86)
      	at org.apache.parquet.internal.filter2.columnindex.ColumnIndexFilter$1.visit(ColumnIndexFilter.java:81)
      	at org.apache.parquet.filter2.compat.FilterCompat$FilterPredicateCompat.accept(FilterCompat.java:137)
      	at org.apache.parquet.internal.filter2.columnindex.ColumnIndexFilter.calculateRowRanges(ColumnIndexFilter.java:81)
      	at org.apache.parquet.hadoop.ParquetFileReader.getRowRanges(ParquetFileReader.java:961)
      	at org.apache.parquet.hadoop.ParquetFileReader.getFilteredRecordCount(ParquetFileReader.java:766)
      	... 28 more
      (the executor prints the same stack trace a second time for a second task; the verbatim duplicate is omitted here)
      +---------+
      |(a + 150)|
      +---------+
      |      151|
      |      153|
      +---------+
      The result set is also incomplete: by the same logic as test 2, row (5, null, 0) satisfies the predicate, so 155 should be returned as well.
      
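      All three traces share one pattern: calculateRowRanges evaluates the pushed predicate against per-page min/max arrays, and those arrays are empty for column chunks whose pages are all null (the files holding rows 5 to 7). A minimal model of this suspected failure mode, using hypothetical names rather than parquet-mr source, reproduces the exception in a plain Scala REPL:
        // One entry per page; true means the page holds only nulls.
        val nullPages: Array[Boolean] = Array(true)
        // Only pages with at least one value get a min entry, so an
        // all-null column chunk yields an empty array.
        val minValues: Array[Int] = Array.empty
        // Indexing minValues without first consulting nullPages mirrors
        // the "Index 0 out of bounds for length 0" failures above.
        def compareValueToMin(pageIndex: Int, v: Int): Int =
          Integer.compare(v, minValues(pageIndex))
        compareValueToMin(0, 10) // throws ArrayIndexOutOfBoundsException
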


            People

              Assignee: Gabor Szadovszky (gszadovszky)
              Reporter: Yuming Wang (yumwang)
              Votes: 0
              Watchers: 3
