diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedListColumnReader.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedListColumnReader.java
index 12af77c123..04fa129e04 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedListColumnReader.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedListColumnReader.java
@@ -19,6 +19,7 @@
 import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
 import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
 import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo;
 import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
@@ -52,6 +53,10 @@ public VectorizedListColumnReader(ColumnDescriptor descriptor, PageReader pageRe
   @Override
   public void readBatch(int total, ColumnVector column, TypeInfo columnType) throws IOException {
     ListColumnVector lcv = (ListColumnVector) column;
+    // Before readBatch, initialize the size of offsets & lengths to the default value;
+    // the actual size will be assigned in setChildrenInfo() after reading completes.
+    lcv.offsets = new long[VectorizedRowBatch.DEFAULT_SIZE];
+    lcv.lengths = new long[VectorizedRowBatch.DEFAULT_SIZE];
     // Because the length of ListColumnVector.child can't be known now,
     // the valueList will save all data for ListColumnVector temporary.
     List valueList = new ArrayList<>();
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestVectorizedListColumnReader.java b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestVectorizedListColumnReader.java
index 8ea5d25677..d241fc816e 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestVectorizedListColumnReader.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestVectorizedListColumnReader.java
@@ -166,6 +166,14 @@ public void testUnrepeatedStringWithoutNullListRead() throws Exception {
     removeFile();
   }
 
+  @Test
+  public void testVectorizedRowBatchSizeChange() throws Exception {
+    removeFile();
+    writeListData(initWriterFromFile(), false, 1200);
+    testVectorizedRowBatchSizeChangeListRead();
+    removeFile();
+  }
+
   private void testListReadAllType(boolean isDictionaryEncoding, int elementNum) throws Exception {
     testListRead(isDictionaryEncoding, "int", elementNum);
     testListRead(isDictionaryEncoding, "long", elementNum);
@@ -337,4 +345,29 @@ private void testUnRepeateStringWithoutNullListRead() throws Exception {
       reader.close();
     }
   }
+
+  private void testVectorizedRowBatchSizeChangeListRead() throws Exception {
+    Configuration conf = new Configuration();
+    conf.set(IOConstants.COLUMNS, "list_binary_field_for_repeat_test");
+    conf.set(IOConstants.COLUMNS_TYPES, "array<binary>");
+    conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
+    conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0");
+    VectorizedParquetRecordReader reader = createTestParquetReader(
+        "message hive_schema {repeated binary list_binary_field_for_repeat_test;}", conf);
+    VectorizedRowBatch previous = reader.createValue();
+    try {
+      while (reader.next(NullWritable.get(), previous)) {
+        ListColumnVector vector = (ListColumnVector) previous.cols[0];
+        // When dealing with big data, the VectorizedRowBatch is reused across different file
+        // splits to cache the data. Here is the situation: the first split has only 100 rows,
+        // the VectorizedRowBatch caches them, and its size is updated to 100. The following
+        // code simulates that size change; there must be no ArrayIndexOutOfBoundsException
+        // when processing the next split, which has more than 100 rows.
+        vector.lengths = new long[100];
+        vector.offsets = new long[100];
+      }
+    } finally {
+      reader.close();
+    }
+  }
 }
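
For context, a minimal standalone sketch of the failure mode the new test exercises, using only the classes already referenced in the diff (ListColumnVector, VectorizedRowBatch); the class name ListVectorResizeSketch and the BytesColumnVector child are illustrative assumptions, and this sketch is not part of the patch itself:

// Sketch of why readBatch() must re-allocate offsets/lengths before each batch
// when the previous split left them sized down.
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;

public class ListVectorResizeSketch {
  public static void main(String[] args) {
    ListColumnVector lcv =
        new ListColumnVector(VectorizedRowBatch.DEFAULT_SIZE, new BytesColumnVector());

    // A small split leaves the cached batch's offsets/lengths arrays sized to 100,
    // which is the state the test simulates between calls to reader.next().
    lcv.offsets = new long[100];
    lcv.lengths = new long[100];

    // What the patched readBatch() now does first: re-allocate to the default size so the
    // arrays can hold a full batch again; setChildrenInfo() later assigns the real size.
    lcv.offsets = new long[VectorizedRowBatch.DEFAULT_SIZE];
    lcv.lengths = new long[VectorizedRowBatch.DEFAULT_SIZE];

    // Writing row 100 of the next, larger split is now safe; with the stale 100-entry
    // arrays it would throw ArrayIndexOutOfBoundsException.
    lcv.offsets[100] = 0;
    lcv.lengths[100] = 1;
  }
}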