From 3f586e9bd4f143451a268c892bbced5c00355f2d Mon Sep 17 00:00:00 2001
From: Sparksfyz
Date: Tue, 16 Jun 2020 17:28:57 +0800
Subject: [PATCH] HIVE-23688: fix Vectorization IndexArrayOutOfBoundsException
 when reading null values in map

---
 .../test/resources/testconfiguration.properties    |  3 +-
 .../parquet/vector/VectorizedListColumnReader.java | 35 +++++++++++-----
 .../parquet_map_null_vectorization.q               | 20 +++++++++
 .../tez/parquet_map_null_vectorization.q.out       | 48 ++++++++++++++++++++++
 .../hive/ql/exec/vector/BytesColumnVector.java     |  4 ++
 5 files changed, 99 insertions(+), 11 deletions(-)
 create mode 100644 ql/src/test/queries/clientpositive/parquet_map_null_vectorization.q
 create mode 100644 ql/src/test/results/clientpositive/tez/parquet_map_null_vectorization.q.out

diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties
index f430a131af..7b1f330103 100644
--- a/itests/src/test/resources/testconfiguration.properties
+++ b/itests/src/test/resources/testconfiguration.properties
@@ -120,6 +120,7 @@ minillap.query.files=\
   parallel_colstats.q,\
   parquet_complex_types_vectorization.q,\
   parquet_map_type_vectorization.q,\
+  parquet_map_null_vectorization.q,\
   parquet_struct_type_vectorization.q,\
   parquet_types_vectorization.q,\
   partcols1.q,\
@@ -1239,4 +1240,4 @@ druid.llap.local.query.files=\
 erasurecoding.only.query.files=\
   erasure_commands.q,\
   erasure_explain.q,\
-  erasure_simple.q
+  erasure_simple.q
\ No newline at end of file
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedListColumnReader.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedListColumnReader.java
index 6136ce056d..3a0b025bba 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedListColumnReader.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedListColumnReader.java
@@ -114,6 +114,8 @@ private boolean fetchNextValue(PrimitiveObjectInspector.PrimitiveCategory catego
       } else {
         lastValue = readPrimitiveTypedRow(category);
       }
+    } else {
+      lastValue = null;
     }
     return true;
   } else {
@@ -128,17 +130,12 @@ private boolean fetchNextValue(PrimitiveObjectInspector.PrimitiveCategory catego
   private void addElement(ListColumnVector lcv, List elements,
       PrimitiveObjectInspector.PrimitiveCategory category, int index) throws IOException {
     lcv.offsets[index] = elements.size();
 
-    // Return directly if last value is null
-    if (definitionLevel < maxDefLevel) {
-      lcv.isNull[index] = true;
-      lcv.lengths[index] = 0;
-      // fetch the data from parquet data page for next call
-      fetchNextValue(category);
-      return;
-    }
-
     do {
       // add all data for an element in ListColumnVector, get out the loop if there is no data or the data is for new element
+      if (definitionLevel < maxDefLevel) {
+        lcv.lengths[index] = 0;
+        lcv.noNulls = false;
+      }
       elements.add(lastValue);
     } while (fetchNextValue(category) && (repetitionLevel != 0));
@@ -279,6 +276,9 @@ private void fillColumnVector(PrimitiveObjectInspector.PrimitiveCategory categor
       lcv.child = new LongColumnVector(total);
       for (int i = 0; i < valueList.size(); i++) {
         ((LongColumnVector) lcv.child).vector[i] = ((List<Integer>) valueList).get(i);
+        if (valueList.get(i) == null) {
+          lcv.child.isNull[i] = true;
+        }
       }
       break;
     case INT:
@@ -290,12 +290,18 @@ private void fillColumnVector(PrimitiveObjectInspector.PrimitiveCategory categor
       lcv.child = new LongColumnVector(total);
       for (int i = 0; i < valueList.size(); i++) {
         ((LongColumnVector) lcv.child).vector[i] = ((List<Long>) valueList).get(i);
+        if (valueList.get(i) == null) {
+          lcv.child.isNull[i] = true;
+        }
       }
       break;
     case DOUBLE:
       lcv.child = new DoubleColumnVector(total);
       for (int i = 0; i < valueList.size(); i++) {
         ((DoubleColumnVector) lcv.child).vector[i] = ((List<Double>) valueList).get(i);
+        if (valueList.get(i) == null) {
+          lcv.child.isNull[i] = true;
+        }
       }
       break;
     case BINARY:
@@ -306,13 +312,16 @@
       lcv.child.init();
       for (int i = 0; i < valueList.size(); i++) {
         byte[] src = ((List<byte[]>) valueList).get(i);
-        ((BytesColumnVector) lcv.child).setRef(i, src, 0, src.length);
+        ((BytesColumnVector) lcv.child).setRef(i, src, 0, src == null ? 0 : src.length);
       }
       break;
     case FLOAT:
       lcv.child = new DoubleColumnVector(total);
       for (int i = 0; i < valueList.size(); i++) {
         ((DoubleColumnVector) lcv.child).vector[i] = ((List<Float>) valueList).get(i);
+        if (valueList.get(i) == null) {
+          lcv.child.isNull[i] = true;
+        }
       }
       break;
     case DECIMAL:
@@ -323,6 +332,9 @@
       lcv.child = new DecimalColumnVector(total, precision, scale);
       for (int i = 0; i < valueList.size(); i++) {
         ((DecimalColumnVector) lcv.child).vector[i].set(((List<byte[]>) valueList).get(i), scale);
+        if (valueList.get(i) == null) {
+          lcv.child.isNull[i] = true;
+        }
       }
       break;
     case INTERVAL_DAY_TIME:
@@ -478,6 +490,9 @@ private boolean compareBytesColumnVector(BytesColumnVector cv1, BytesColumnVecto
     int length2 = cv2.vector.length;
     if (length1 == length2) {
       for (int i = 0; i < length1; i++) {
+        if (cv1.vector[i] == null && cv2.vector[i] == null) {
+          continue;
+        }
         int innerLen1 = cv1.vector[i].length;
         int innerLen2 = cv2.vector[i].length;
         if (innerLen1 == innerLen2) {
diff --git a/ql/src/test/queries/clientpositive/parquet_map_null_vectorization.q b/ql/src/test/queries/clientpositive/parquet_map_null_vectorization.q
new file mode 100644
index 0000000000..9f072e84da
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/parquet_map_null_vectorization.q
@@ -0,0 +1,20 @@
+set hive.mapred.mode=nonstrict;
+set hive.vectorized.execution.enabled=true;
+set hive.fetch.task.conversion=none;
+
+DROP TABLE parquet_map_type;
+
+
+CREATE TABLE parquet_map_type (
+id int,
+stringMap map<string, string>
+) stored as parquet;
+
+
+insert overwrite table parquet_map_type
+SELECT 1, MAP('k1', null, 'k2', 'v2');
+
+
+select id, stringMap from parquet_map_type;
+
+select id, stringMap['k1'] from parquet_map_type group by id, stringMap['k1'];
diff --git a/ql/src/test/results/clientpositive/tez/parquet_map_null_vectorization.q.out b/ql/src/test/results/clientpositive/tez/parquet_map_null_vectorization.q.out
new file mode 100644
index 0000000000..119b93c0eb
--- /dev/null
+++ b/ql/src/test/results/clientpositive/tez/parquet_map_null_vectorization.q.out
@@ -0,0 +1,48 @@
+PREHOOK: query: DROP TABLE parquet_map_type
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE parquet_map_type
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: CREATE TABLE parquet_map_type (
+id int,
+stringMap map<string, string>
+) stored as parquet
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@parquet_map_type
+POSTHOOK: query: CREATE TABLE parquet_map_type (
+id int,
+stringMap map<string, string>
+) stored as parquet
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@parquet_map_type
+PREHOOK: query: insert overwrite table parquet_map_type
+SELECT 1, MAP('k1', null, 'k2', 'v2')
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@parquet_map_type
+POSTHOOK: query: insert overwrite table parquet_map_type
+SELECT 1, MAP('k1', null, 'k2', 'v2')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@parquet_map_type
+POSTHOOK: Lineage: parquet_map_type.id SIMPLE []
+POSTHOOK: Lineage: parquet_map_type.stringmap EXPRESSION []
+PREHOOK: query: select id, stringMap from parquet_map_type
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parquet_map_type
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: select id, stringMap from parquet_map_type
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parquet_map_type
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+1	{"k1":null,"k2":"v2"}
+PREHOOK: query: select id, stringMap['k1'] from parquet_map_type group by id, stringMap['k1']
+PREHOOK: type: QUERY
+PREHOOK: Input: default@parquet_map_type
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: select id, stringMap['k1'] from parquet_map_type group by id, stringMap['k1']
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@parquet_map_type
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+1	NULL
diff --git a/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.java b/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.java
index 6618807136..4fd292350b 100644
--- a/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.java
+++ b/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.java
@@ -111,6 +111,10 @@ public void reset() {
    * @param length length of source byte sequence
    */
   public void setRef(int elementNum, byte[] sourceBuf, int start, int length) {
+    if (sourceBuf == null) {
+      this.isNull[elementNum] = true;
+      this.noNulls = false;
+    }
     vector[elementNum] = sourceBuf;
     this.start[elementNum] = start;
     this.length[elementNum] = length;
--
2.13.0
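
The BytesColumnVector.setRef() guard above is the piece that keeps null map
values from triggering the IndexArrayOutOfBoundsException described in
HIVE-23688, so here is a minimal standalone sketch of the new behavior. It is
not part of the patch: the class name SetRefNullDemo and its main method are
made up for illustration, and it assumes the patched storage-api jar is on
the classpath.

import java.nio.charset.StandardCharsets;

import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;

public class SetRefNullDemo {
  public static void main(String[] args) {
    BytesColumnVector cv = new BytesColumnVector(2);

    // Null source buffer: the patched setRef() now flags the entry as null
    // (isNull[0] = true, noNulls = false) instead of leaving the null-ness
    // invisible to downstream vectorized operators.
    cv.setRef(0, null, 0, 0);

    // Non-null path is unchanged: the vector just references the caller's bytes.
    byte[] v2 = "v2".getBytes(StandardCharsets.UTF_8);
    cv.setRef(1, v2, 0, v2.length);

    System.out.println(cv.isNull[0]); // true
    System.out.println(cv.noNulls);   // false
  }
}

Note that setRef() still stores the null reference in vector[0]; callers such
as the patched fillColumnVector() pass length 0 (src == null ? 0 : src.length)
so the null buffer is never dereferenced.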