diff --git itests/src/test/resources/testconfiguration.properties itests/src/test/resources/testconfiguration.properties
index e188619..3f6fbe4 100644
--- itests/src/test/resources/testconfiguration.properties
+++ itests/src/test/resources/testconfiguration.properties
@@ -730,6 +730,7 @@ minillaplocal.query.files=\
   vector_char_varchar_1.q,\
   vector_complex_all.q,\
   vector_complex_join.q,\
+  vector_create_struct_table.q,\
   vector_decimal_2.q,\
   vector_decimal_udf.q,\
   vector_groupby_cube1.q,\
diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorExtractRow.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorExtractRow.java
index f429308..d0961b3 100644
--- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorExtractRow.java
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorExtractRow.java
@@ -90,6 +90,9 @@
   TypeInfo[] typeInfos;
   ObjectInspector[] objectInspectors;

+  private static final byte[] EMPTY_BYTES = new byte[0];
+  private static final String EMPTY_STRING = "";
+
   /*
    * Allocate the various arrays.
    */
@@ -257,18 +260,15 @@ public Object extractRowColumn(
         final int start = bytesColVector.start[adjustedIndex];
         final int length = bytesColVector.length[adjustedIndex];

-        if (bytesColVector.isRepeating) {
-          if (!bytesColVector.isNull[0] && bytes == null) {
+        BytesWritable bytesWritable = (BytesWritable) primitiveWritable;
+        if (bytes == null || length == 0) {
+          if (length > 0) {
             nullBytesReadError(primitiveCategory, batchIndex);
           }
+          bytesWritable.set(EMPTY_BYTES, 0, 0);
         } else {
-          if ((bytesColVector.noNulls || !bytesColVector.isNull[batchIndex]) && bytes == null) {
-            nullBytesReadError(primitiveCategory, batchIndex);
-          }
+          bytesWritable.set(bytes, start, length);
         }
-
-        BytesWritable bytesWritable = (BytesWritable) primitiveWritable;
-        bytesWritable.set(bytes, start, length);
         return primitiveWritable;
       }
     case STRING:
@@ -279,18 +279,16 @@ public Object extractRowColumn(
         final int start = bytesColVector.start[adjustedIndex];
         final int length = bytesColVector.length[adjustedIndex];

-        if (bytesColVector.isRepeating) {
-          if (!bytesColVector.isNull[0] && bytes == null) {
+        if (bytes == null || length == 0) {
+          if (length > 0) {
             nullBytesReadError(primitiveCategory, batchIndex);
           }
+          ((Text) primitiveWritable).set(EMPTY_BYTES, 0, 0);
         } else {
-          if ((bytesColVector.noNulls || !bytesColVector.isNull[batchIndex]) && bytes == null) {
-            nullBytesReadError(primitiveCategory, batchIndex);
-          }
-        }
-        // Use org.apache.hadoop.io.Text as our helper to go from byte[] to String.
-        ((Text) primitiveWritable).set(bytes, start, length);
+          // Use org.apache.hadoop.io.Text as our helper to go from byte[] to String.
+          ((Text) primitiveWritable).set(bytes, start, length);
+        }
         return primitiveWritable;
       }
     case VARCHAR:
@@ -301,21 +299,23 @@ public Object extractRowColumn(
         final int start = bytesColVector.start[adjustedIndex];
         final int length = bytesColVector.length[adjustedIndex];

-        if (bytesColVector.isRepeating) {
-          if (!bytesColVector.isNull[0] && bytes == null) {
+        final HiveVarcharWritable hiveVarcharWritable = (HiveVarcharWritable) primitiveWritable;
+        if (bytes == null || length == 0) {
+          if (length > 0) {
             nullBytesReadError(primitiveCategory, batchIndex);
           }
+          hiveVarcharWritable.set(EMPTY_STRING, -1);
         } else {
-          if ((bytesColVector.noNulls || !bytesColVector.isNull[batchIndex]) && bytes == null) {
-            nullBytesReadError(primitiveCategory, batchIndex);
+          final int adjustedLength =
+              StringExpr.truncate(
+                  bytes, start, length, ((VarcharTypeInfo) primitiveTypeInfo).getLength());
+          if (adjustedLength == 0) {
+            hiveVarcharWritable.set(EMPTY_STRING, -1);
+          } else {
+            hiveVarcharWritable.set(
+                new String(bytes, start, adjustedLength, Charsets.UTF_8), -1);
           }
         }
-
-        final int adjustedLength = StringExpr.truncate(bytes, start, length,
-            ((VarcharTypeInfo) primitiveTypeInfo).getLength());
-
-        final HiveVarcharWritable hiveVarcharWritable = (HiveVarcharWritable) primitiveWritable;
-        hiveVarcharWritable.set(new String(bytes, start, adjustedLength, Charsets.UTF_8), -1);
         return primitiveWritable;
       }
     case CHAR:
@@ -326,22 +326,24 @@ public Object extractRowColumn(
         final int start = bytesColVector.start[adjustedIndex];
         final int length = bytesColVector.length[adjustedIndex];

-        if (bytesColVector.isRepeating) {
-          if (!bytesColVector.isNull[0] && bytes == null) {
+        final HiveCharWritable hiveCharWritable = (HiveCharWritable) primitiveWritable;
+        final int maxLength = ((CharTypeInfo) primitiveTypeInfo).getLength();
+        if (bytes == null || length == 0) {
+          if (length > 0) {
             nullBytesReadError(primitiveCategory, batchIndex);
           }
+          hiveCharWritable.set(EMPTY_STRING, maxLength);
         } else {
-          if ((bytesColVector.noNulls || !bytesColVector.isNull[batchIndex]) && bytes == null) {
-            nullBytesReadError(primitiveCategory, batchIndex);
+          final int adjustedLength = StringExpr.rightTrimAndTruncate(bytes, start, length,
+              ((CharTypeInfo) primitiveTypeInfo).getLength());
+
+          if (adjustedLength == 0) {
+            hiveCharWritable.set(EMPTY_STRING, maxLength);
+          } else {
+            hiveCharWritable.set(
+                new String(bytes, start, adjustedLength, Charsets.UTF_8), maxLength);
           }
         }
-
-        final int adjustedLength = StringExpr.rightTrimAndTruncate(bytes, start, length,
-            ((CharTypeInfo) primitiveTypeInfo).getLength());
-
-        final HiveCharWritable hiveCharWritable = (HiveCharWritable) primitiveWritable;
-        hiveCharWritable.set(new String(bytes, start, adjustedLength, Charsets.UTF_8),
-            ((CharTypeInfo) primitiveTypeInfo).getLength());
         return primitiveWritable;
       }
     case DECIMAL:
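
The rewritten BINARY/STRING/VARCHAR/CHAR cases above all apply the same rule: a zero-length value (which the LazySimple deserializer can now produce for a prematurely ended field) is materialized as an empty Writable, and nullBytesReadError is only raised for the corrupt case where a positive length has no backing byte array. A minimal, self-contained sketch of that decision, using a hypothetical class and plain JDK calls rather than the Hive code itself:

// Hypothetical sketch (not the Hive class): mirrors the null-vs-empty decision the patch
// makes when extracting a string-family value from a (bytes, start, length) triple.
public class EmptyVsNullSketch {

  /** Returns the extracted String, or throws if a non-empty value has no backing bytes. */
  static String extract(byte[] bytes, int start, int length) {
    if (bytes == null || length == 0) {
      if (length > 0) {
        // A positive length with no backing array is real corruption; keep erroring.
        throw new RuntimeException("null bytes read error");
      }
      // Zero length (e.g. a deserialized empty field) is a legal empty value, not NULL.
      return "";
    }
    return new String(bytes, start, length, java.nio.charset.StandardCharsets.UTF_8);
  }

  public static void main(String[] args) {
    byte[] buf = "val_238".getBytes(java.nio.charset.StandardCharsets.UTF_8);
    System.out.println(extract(buf, 0, buf.length));    // "val_238"
    System.out.println(extract(null, 0, 0).isEmpty());  // true: empty value, not an error
  }
}

In the patch the same split also lets the VARCHAR/CHAR branches skip truncation entirely when the adjusted length comes back as zero.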
diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/VectorUDFStructField.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/VectorUDFStructField.java
index 0507fa5..b40126a 100644
--- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/VectorUDFStructField.java
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/VectorUDFStructField.java
@@ -18,6 +18,8 @@

 package org.apache.hadoop.hive.ql.exec.vector.expressions;

+import java.util.Arrays;
+
 import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.VectorExpressionDescriptor;
@@ -44,36 +46,96 @@ public VectorUDFStructField(int structColumnNum, int fieldIndex, int outputColum

   @Override
   public void evaluate(VectorizedRowBatch batch) {
+
+    // return immediately if batch is empty
+    final int n = batch.size;
+    if (n == 0) {
+      return;
+    }
+
     if (childExpressions != null) {
       super.evaluateChildren(batch);
     }

     ColumnVector outV = batch.cols[outputColumnNum];
+    int[] sel = batch.selected;
     StructColumnVector structColumnVector = (StructColumnVector) batch.cols[structColumnNum];
     ColumnVector fieldColumnVector = structColumnVector.fields[fieldIndex];

-    outV.noNulls = true;
+    boolean[] inputIsNull = structColumnVector.isNull;
+    boolean[] outputIsNull = outV.isNull;
+
+    // We do not need to do a column reset since we are carefully changing the output.
+    outV.isRepeating = false;
+
     if (structColumnVector.isRepeating) {
-      if (structColumnVector.isNull[0]) {
-        outV.isNull[0] = true;
-        outV.noNulls = false;
-      } else {
+      if (structColumnVector.noNulls || !structColumnVector.isNull[0]) {
+        outputIsNull[0] = false;
         outV.setElement(0, 0, fieldColumnVector);
-        outV.isNull[0] = false;
+      } else {
+        outputIsNull[0] = true;
+        outV.noNulls = false;
       }
       outV.isRepeating = true;
-    } else {
-      for (int i = 0; i < batch.size; i++) {
-        int j = (batch.selectedInUse) ? batch.selected[i] : i;
-        if (structColumnVector.isNull[j]) {
-          outV.isNull[j] = true;
-          outV.noNulls = false;
+      return;
+    }
+    if (structColumnVector.noNulls) {
+      if (batch.selectedInUse) {
+
+        // CONSIDER: For large n, fill n or all of isNull array and use the tighter ELSE loop.
+
+        if (!outV.noNulls) {
+          for(int j = 0; j != n; j++) {
+            final int i = sel[j];
+            outputIsNull[i] = false;
+            outV.setElement(i, i, fieldColumnVector);
+          }
         } else {
-          outV.setElement(j, j, fieldColumnVector);
-          outV.isNull[j] = false;
+          for(int j = 0; j != n; j++) {
+            final int i = sel[j];
+            outV.setElement(i, i, fieldColumnVector);
+          }
+        }
+      } else {
+        if (!outV.noNulls) {
+
+          // Assume it is almost always a performance win to fill all of isNull so we can
+          // safely reset noNulls.
+          Arrays.fill(outputIsNull, false);
+          outV.noNulls = true;
+        }
+        for(int i = 0; i != n; i++) {
+          outV.setElement(i, i, fieldColumnVector);
+        }
+      }
+    } else /* there are NULLs in the structColumnVector */ {
+
+      /*
+       * Do careful maintenance of the outputColVector.noNulls flag.
+       */
+
+      if (batch.selectedInUse) {
+        for(int j=0; j != n; j++) {
+          int i = sel[j];
+          if (!inputIsNull[i]) {
+            outputIsNull[i] = false;
+            outV.setElement(i, i, fieldColumnVector);
+          } else {
+            outputIsNull[i] = true;
+            outV.noNulls = false;
+          }
+        }
+      } else {
+        for(int i = 0; i != n; i++) {
+          if (!inputIsNull[i]) {
+            outputIsNull[i] = false;
+            outV.setElement(i, i, fieldColumnVector);
+          } else {
+            outputIsNull[i] = true;
+            outV.noNulls = false;
+          }
         }
       }
-      outV.isRepeating = false;
     }
   }
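
The new evaluate() follows the usual Hive vector-expression template: handle the isRepeating case up front, then split the row loop on structColumnVector.noNulls and batch.selectedInUse so the per-row null check only runs when NULLs can actually occur, and clear outV.noNulls only when a NULL is really written. A compressed, hypothetical sketch of that null-propagation rule, folding the four specialized loops of the patch into one for readability:

// Hypothetical sketch (not the Hive class): condenses the specialized loops of the new
// evaluate() into a single loop to show the null-propagation rule they all implement.
public class NullPropagationSketch {

  /** Copies field[i] to out[i] for the selected rows; returns the output noNulls flag. */
  static boolean copyField(int n, boolean selectedInUse, int[] sel,
      boolean inputNoNulls, boolean[] inputIsNull,
      long[] field, long[] out, boolean[] outIsNull) {
    boolean outNoNulls = true;
    for (int j = 0; j < n; j++) {
      final int i = selectedInUse ? sel[j] : j;   // map loop position to row index
      if (inputNoNulls || !inputIsNull[i]) {
        outIsNull[i] = false;
        out[i] = field[i];
      } else {
        outIsNull[i] = true;
        outNoNulls = false;                       // only an actual NULL clears the flag
      }
    }
    return outNoNulls;
  }

  public static void main(String[] args) {
    long[] field = {10, 20, 30, 40};
    boolean[] isNull = {false, true, false, false};
    long[] out = new long[4];
    boolean[] outIsNull = new boolean[4];
    int[] sel = {0, 1, 3};                        // rows 0, 1 and 3 are selected
    boolean noNulls = copyField(3, true, sel, false, isNull, field, out, outIsNull);
    System.out.println(noNulls + " " + out[3] + " " + outIsNull[1]);  // false 40 true
  }
}

The real patch keeps the branches outside the loop so the common all-not-null path has no per-row conditional, and the repeating case returns early after writing a single element.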
diff --git ql/src/test/queries/clientpositive/create_struct_table.q ql/src/test/queries/clientpositive/create_struct_table.q
index 1e5d151..fafe52c 100644
--- ql/src/test/queries/clientpositive/create_struct_table.q
+++ ql/src/test/queries/clientpositive/create_struct_table.q
@@ -1,12 +1,35 @@
+SET hive.vectorized.execution.enabled=false;
+
-create table abc(strct struct<a:int, b:string, c:string>)
+-- The kv1 input file has 2 data fields, so when the 3 field struct is deserialized,
+-- the premature end will put a NULL in field #3.
+create table string_fields(strct struct<a:int, b:string, c:string>)
 row format delimited
   fields terminated by '\t'
   collection items terminated by '\001';

 load data local inpath '../../data/files/kv1.txt'
-overwrite into table abc;
+overwrite into table string_fields;

-SELECT strct, strct.a, strct.b FROM abc LIMIT 10;
+SELECT strct, strct.a, strct.b, strct.c FROM string_fields LIMIT 10;

+create table char_fields(strct struct<a:int, b:char(10), c:char(10)>)
+row format delimited
+  fields terminated by '\t'
+  collection items terminated by '\001';
+
+load data local inpath '../../data/files/kv1.txt'
+overwrite into table char_fields;
+
+SELECT strct, strct.a, strct.b, strct.c FROM char_fields LIMIT 10;
+
+
+create table varchar_fields(strct struct<a:int, b:varchar(5), c:varchar(5)>)
+row format delimited
+  fields terminated by '\t'
+  collection items terminated by '\001';
+
+load data local inpath '../../data/files/kv1.txt'
+overwrite into table varchar_fields;
+
+SELECT strct, strct.a, strct.b, strct.c FROM varchar_fields LIMIT 10;
diff --git ql/src/test/queries/clientpositive/vector_create_struct_table.q ql/src/test/queries/clientpositive/vector_create_struct_table.q
new file mode 100644
index 0000000..db26cb2
--- /dev/null
+++ ql/src/test/queries/clientpositive/vector_create_struct_table.q
@@ -0,0 +1,45 @@
+SET hive.vectorized.execution.enabled=true;
+set hive.fetch.task.conversion=none;
+
+-- The kv1 input file has 2 data fields, so when the 3 field struct is deserialized,
+-- the premature end will put a NULL in field #3.
+create table string_fields(strct struct<a:int, b:string, c:string>)
+row format delimited
+  fields terminated by '\t'
+  collection items terminated by '\001';
+
+load data local inpath '../../data/files/kv1.txt'
+overwrite into table string_fields;
+
+EXPLAIN VECTORIZATION EXPRESSION
+SELECT strct, strct.a, strct.b, strct.c FROM string_fields LIMIT 10;
+
+SELECT strct, strct.a, strct.b, strct.c FROM string_fields LIMIT 10;
+
+
+create table char_fields(strct struct<a:int, b:char(10), c:char(10)>)
+row format delimited
+  fields terminated by '\t'
+  collection items terminated by '\001';
+
+load data local inpath '../../data/files/kv1.txt'
+overwrite into table char_fields;
+
+EXPLAIN VECTORIZATION EXPRESSION
+SELECT strct, strct.a, strct.b, strct.c FROM char_fields LIMIT 10;
+
+SELECT strct, strct.a, strct.b, strct.c FROM char_fields LIMIT 10;
+
+
+create table varchar_fields(strct struct<a:int, b:varchar(5), c:varchar(5)>)
+row format delimited
+  fields terminated by '\t'
+  collection items terminated by '\001';
+
+load data local inpath '../../data/files/kv1.txt'
+overwrite into table varchar_fields;
+
+EXPLAIN VECTORIZATION EXPRESSION
+SELECT strct, strct.a, strct.b, strct.c FROM varchar_fields LIMIT 10;
+
+SELECT strct, strct.a, strct.b, strct.c FROM varchar_fields LIMIT 10;
diff --git ql/src/test/results/clientpositive/create_struct_table.q.out ql/src/test/results/clientpositive/create_struct_table.q.out
index f4c7829..f2fd893 100644
--- ql/src/test/results/clientpositive/create_struct_table.q.out
+++ ql/src/test/results/clientpositive/create_struct_table.q.out
@@ -1,42 +1,126 @@
-PREHOOK: query: create table abc(strct struct<a:int, b:string, c:string>)
+PREHOOK: query: create table string_fields(strct struct<a:int, b:string, c:string>)
 row format delimited
   fields terminated by '\t'
   collection items terminated by '\001'
 PREHOOK: type: CREATETABLE
 PREHOOK: Output: database:default
-PREHOOK: Output: default@abc
-POSTHOOK: query: create table abc(strct struct<a:int, b:string, c:string>)
+PREHOOK: Output: default@string_fields
+POSTHOOK: query: create table string_fields(strct struct<a:int, b:string, c:string>)
 row format delimited
   fields terminated by '\t'
   collection items terminated by '\001'
 POSTHOOK: type: CREATETABLE
 POSTHOOK: Output: database:default
-POSTHOOK: Output: default@abc +POSTHOOK: Output: default@string_fields PREHOOK: query: load data local inpath '../../data/files/kv1.txt' -overwrite into table abc +overwrite into table string_fields PREHOOK: type: LOAD #### A masked pattern was here #### -PREHOOK: Output: default@abc +PREHOOK: Output: default@string_fields POSTHOOK: query: load data local inpath '../../data/files/kv1.txt' -overwrite into table abc +overwrite into table string_fields POSTHOOK: type: LOAD #### A masked pattern was here #### -POSTHOOK: Output: default@abc -PREHOOK: query: SELECT strct, strct.a, strct.b FROM abc LIMIT 10 +POSTHOOK: Output: default@string_fields +PREHOOK: query: SELECT strct, strct.a, strct.b, strct.c FROM string_fields LIMIT 10 PREHOOK: type: QUERY -PREHOOK: Input: default@abc +PREHOOK: Input: default@string_fields #### A masked pattern was here #### -POSTHOOK: query: SELECT strct, strct.a, strct.b FROM abc LIMIT 10 +POSTHOOK: query: SELECT strct, strct.a, strct.b, strct.c FROM string_fields LIMIT 10 POSTHOOK: type: QUERY -POSTHOOK: Input: default@abc -#### A masked pattern was here #### -{"a":238,"b":"val_238","c":null} 238 val_238 -{"a":86,"b":"val_86","c":null} 86 val_86 -{"a":311,"b":"val_311","c":null} 311 val_311 -{"a":27,"b":"val_27","c":null} 27 val_27 -{"a":165,"b":"val_165","c":null} 165 val_165 -{"a":409,"b":"val_409","c":null} 409 val_409 -{"a":255,"b":"val_255","c":null} 255 val_255 -{"a":278,"b":"val_278","c":null} 278 val_278 -{"a":98,"b":"val_98","c":null} 98 val_98 -{"a":484,"b":"val_484","c":null} 484 val_484 +POSTHOOK: Input: default@string_fields +#### A masked pattern was here #### +{"a":238,"b":"val_238","c":null} 238 val_238 NULL +{"a":86,"b":"val_86","c":null} 86 val_86 NULL +{"a":311,"b":"val_311","c":null} 311 val_311 NULL +{"a":27,"b":"val_27","c":null} 27 val_27 NULL +{"a":165,"b":"val_165","c":null} 165 val_165 NULL +{"a":409,"b":"val_409","c":null} 409 val_409 NULL +{"a":255,"b":"val_255","c":null} 255 val_255 NULL +{"a":278,"b":"val_278","c":null} 278 val_278 NULL +{"a":98,"b":"val_98","c":null} 98 val_98 NULL +{"a":484,"b":"val_484","c":null} 484 val_484 NULL +PREHOOK: query: create table char_fields(strct struct) +row format delimited + fields terminated by '\t' + collection items terminated by '\001' +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@char_fields +POSTHOOK: query: create table char_fields(strct struct) +row format delimited + fields terminated by '\t' + collection items terminated by '\001' +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@char_fields +PREHOOK: query: load data local inpath '../../data/files/kv1.txt' +overwrite into table char_fields +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@char_fields +POSTHOOK: query: load data local inpath '../../data/files/kv1.txt' +overwrite into table char_fields +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@char_fields +PREHOOK: query: SELECT strct, strct.a, strct.b, strct.c FROM char_fields LIMIT 10 +PREHOOK: type: QUERY +PREHOOK: Input: default@char_fields +#### A masked pattern was here #### +POSTHOOK: query: SELECT strct, strct.a, strct.b, strct.c FROM char_fields LIMIT 10 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@char_fields +#### A masked pattern was here #### +{"a":238,"b":"val_238 ","c":null} 238 val_238 NULL +{"a":86,"b":"val_86 ","c":null} 86 val_86 NULL +{"a":311,"b":"val_311 ","c":null} 311 val_311 NULL +{"a":27,"b":"val_27 
","c":null} 27 val_27 NULL +{"a":165,"b":"val_165 ","c":null} 165 val_165 NULL +{"a":409,"b":"val_409 ","c":null} 409 val_409 NULL +{"a":255,"b":"val_255 ","c":null} 255 val_255 NULL +{"a":278,"b":"val_278 ","c":null} 278 val_278 NULL +{"a":98,"b":"val_98 ","c":null} 98 val_98 NULL +{"a":484,"b":"val_484 ","c":null} 484 val_484 NULL +PREHOOK: query: create table varchar_fields(strct struct) +row format delimited + fields terminated by '\t' + collection items terminated by '\001' +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@varchar_fields +POSTHOOK: query: create table varchar_fields(strct struct) +row format delimited + fields terminated by '\t' + collection items terminated by '\001' +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@varchar_fields +PREHOOK: query: load data local inpath '../../data/files/kv1.txt' +overwrite into table varchar_fields +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@varchar_fields +POSTHOOK: query: load data local inpath '../../data/files/kv1.txt' +overwrite into table varchar_fields +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@varchar_fields +PREHOOK: query: SELECT strct, strct.a, strct.b, strct.c FROM varchar_fields LIMIT 10 +PREHOOK: type: QUERY +PREHOOK: Input: default@varchar_fields +#### A masked pattern was here #### +POSTHOOK: query: SELECT strct, strct.a, strct.b, strct.c FROM varchar_fields LIMIT 10 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@varchar_fields +#### A masked pattern was here #### +{"a":238,"b":"val_2","c":null} 238 val_2 NULL +{"a":86,"b":"val_8","c":null} 86 val_8 NULL +{"a":311,"b":"val_3","c":null} 311 val_3 NULL +{"a":27,"b":"val_2","c":null} 27 val_2 NULL +{"a":165,"b":"val_1","c":null} 165 val_1 NULL +{"a":409,"b":"val_4","c":null} 409 val_4 NULL +{"a":255,"b":"val_2","c":null} 255 val_2 NULL +{"a":278,"b":"val_2","c":null} 278 val_2 NULL +{"a":98,"b":"val_9","c":null} 98 val_9 NULL +{"a":484,"b":"val_4","c":null} 484 val_4 NULL diff --git ql/src/test/results/clientpositive/llap/vector_create_struct_table.q.out ql/src/test/results/clientpositive/llap/vector_create_struct_table.q.out new file mode 100644 index 0000000..14d1803 --- /dev/null +++ ql/src/test/results/clientpositive/llap/vector_create_struct_table.q.out @@ -0,0 +1,336 @@ +PREHOOK: query: create table string_fields(strct struct) +row format delimited + fields terminated by '\t' + collection items terminated by '\001' +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@string_fields +POSTHOOK: query: create table string_fields(strct struct) +row format delimited + fields terminated by '\t' + collection items terminated by '\001' +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@string_fields +PREHOOK: query: load data local inpath '../../data/files/kv1.txt' +overwrite into table string_fields +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@string_fields +POSTHOOK: query: load data local inpath '../../data/files/kv1.txt' +overwrite into table string_fields +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@string_fields +PREHOOK: query: EXPLAIN VECTORIZATION EXPRESSION +SELECT strct, strct.a, strct.b, strct.c FROM string_fields LIMIT 10 +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN VECTORIZATION EXPRESSION +SELECT strct, strct.a, strct.b, strct.c FROM 
string_fields LIMIT 10 +POSTHOOK: type: QUERY +PLAN VECTORIZATION: + enabled: true + enabledConditionsMet: [hive.vectorized.execution.enabled IS true] + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: string_fields + Statistics: Num rows: 1 Data size: 428 Basic stats: COMPLETE Column stats: NONE + TableScan Vectorization: + native: true + Select Operator + expressions: strct (type: struct), strct.a (type: int), strct.b (type: string), strct.c (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Select Vectorization: + className: VectorSelectOperator + native: true + projectedOutputColumnNums: [0, 2, 3, 4] + selectExpressions: VectorUDFStructField(col 0:struct, col 0:int) -> 2:int, VectorUDFStructField(col 0:struct, col 1:int) -> 3:string, VectorUDFStructField(col 0:struct, col 2:int) -> 4:string + Statistics: Num rows: 1 Data size: 428 Basic stats: COMPLETE Column stats: NONE + Limit + Number of rows: 10 + Limit Vectorization: + className: VectorLimitOperator + native: true + Statistics: Num rows: 1 Data size: 428 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + File Sink Vectorization: + className: VectorFileSinkOperator + native: false + Statistics: Num rows: 1 Data size: 428 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Execution mode: vectorized, llap + LLAP IO: no inputs + Map Vectorization: + enabled: true + enabledConditionsMet: hive.vectorized.use.vector.serde.deserialize IS true + inputFormatFeatureSupport: [DECIMAL_64] + vectorizationSupportRemovedReasons: [DECIMAL_64 disabled because LLAP is enabled] + featureSupportInUse: [] + inputFileFormats: org.apache.hadoop.mapred.TextInputFormat + allNative: false + usesVectorUDFAdaptor: false + vectorized: true + + Stage: Stage-0 + Fetch Operator + limit: 10 + Processor Tree: + ListSink + +PREHOOK: query: SELECT strct, strct.a, strct.b, strct.c FROM string_fields LIMIT 10 +PREHOOK: type: QUERY +PREHOOK: Input: default@string_fields +#### A masked pattern was here #### +POSTHOOK: query: SELECT strct, strct.a, strct.b, strct.c FROM string_fields LIMIT 10 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@string_fields +#### A masked pattern was here #### +{"a":238,"b":"val_238","c":null} 238 val_238 NULL +{"a":86,"b":"val_86","c":null} 86 val_86 NULL +{"a":311,"b":"val_311","c":null} 311 val_311 NULL +{"a":27,"b":"val_27","c":null} 27 val_27 NULL +{"a":165,"b":"val_165","c":null} 165 val_165 NULL +{"a":409,"b":"val_409","c":null} 409 val_409 NULL +{"a":255,"b":"val_255","c":null} 255 val_255 NULL +{"a":278,"b":"val_278","c":null} 278 val_278 NULL +{"a":98,"b":"val_98","c":null} 98 val_98 NULL +{"a":484,"b":"val_484","c":null} 484 val_484 NULL +PREHOOK: query: create table char_fields(strct struct) +row format delimited + fields terminated by '\t' + collection items terminated by '\001' +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@char_fields +POSTHOOK: query: create table char_fields(strct struct) +row format delimited + fields terminated by '\t' + collection items terminated by '\001' +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: 
default@char_fields +PREHOOK: query: load data local inpath '../../data/files/kv1.txt' +overwrite into table char_fields +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@char_fields +POSTHOOK: query: load data local inpath '../../data/files/kv1.txt' +overwrite into table char_fields +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@char_fields +PREHOOK: query: EXPLAIN VECTORIZATION EXPRESSION +SELECT strct, strct.a, strct.b, strct.c FROM char_fields LIMIT 10 +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN VECTORIZATION EXPRESSION +SELECT strct, strct.a, strct.b, strct.c FROM char_fields LIMIT 10 +POSTHOOK: type: QUERY +PLAN VECTORIZATION: + enabled: true + enabledConditionsMet: [hive.vectorized.execution.enabled IS true] + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: char_fields + Statistics: Num rows: 1 Data size: 248 Basic stats: COMPLETE Column stats: NONE + TableScan Vectorization: + native: true + Select Operator + expressions: strct (type: struct), strct.a (type: int), strct.b (type: char(10)), strct.c (type: char(10)) + outputColumnNames: _col0, _col1, _col2, _col3 + Select Vectorization: + className: VectorSelectOperator + native: true + projectedOutputColumnNums: [0, 2, 3, 4] + selectExpressions: VectorUDFStructField(col 0:struct, col 0:int) -> 2:int, VectorUDFStructField(col 0:struct, col 1:int) -> 3:char(10), VectorUDFStructField(col 0:struct, col 2:int) -> 4:char(10) + Statistics: Num rows: 1 Data size: 248 Basic stats: COMPLETE Column stats: NONE + Limit + Number of rows: 10 + Limit Vectorization: + className: VectorLimitOperator + native: true + Statistics: Num rows: 1 Data size: 248 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + File Sink Vectorization: + className: VectorFileSinkOperator + native: false + Statistics: Num rows: 1 Data size: 248 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Execution mode: vectorized, llap + LLAP IO: no inputs + Map Vectorization: + enabled: true + enabledConditionsMet: hive.vectorized.use.vector.serde.deserialize IS true + inputFormatFeatureSupport: [DECIMAL_64] + vectorizationSupportRemovedReasons: [DECIMAL_64 disabled because LLAP is enabled] + featureSupportInUse: [] + inputFileFormats: org.apache.hadoop.mapred.TextInputFormat + allNative: false + usesVectorUDFAdaptor: false + vectorized: true + + Stage: Stage-0 + Fetch Operator + limit: 10 + Processor Tree: + ListSink + +PREHOOK: query: SELECT strct, strct.a, strct.b, strct.c FROM char_fields LIMIT 10 +PREHOOK: type: QUERY +PREHOOK: Input: default@char_fields +#### A masked pattern was here #### +POSTHOOK: query: SELECT strct, strct.a, strct.b, strct.c FROM char_fields LIMIT 10 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@char_fields +#### A masked pattern was here #### +{"a":238,"b":"val_238 ","c":null} 238 val_238 NULL +{"a":86,"b":"val_86 ","c":null} 86 val_86 NULL +{"a":311,"b":"val_311 ","c":null} 311 val_311 NULL +{"a":27,"b":"val_27 ","c":null} 27 val_27 NULL +{"a":165,"b":"val_165 ","c":null} 165 val_165 NULL +{"a":409,"b":"val_409 ","c":null} 409 val_409 NULL +{"a":255,"b":"val_255 
","c":null} 255 val_255 NULL +{"a":278,"b":"val_278 ","c":null} 278 val_278 NULL +{"a":98,"b":"val_98 ","c":null} 98 val_98 NULL +{"a":484,"b":"val_484 ","c":null} 484 val_484 NULL +PREHOOK: query: create table varchar_fields(strct struct) +row format delimited + fields terminated by '\t' + collection items terminated by '\001' +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@varchar_fields +POSTHOOK: query: create table varchar_fields(strct struct) +row format delimited + fields terminated by '\t' + collection items terminated by '\001' +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@varchar_fields +PREHOOK: query: load data local inpath '../../data/files/kv1.txt' +overwrite into table varchar_fields +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@varchar_fields +POSTHOOK: query: load data local inpath '../../data/files/kv1.txt' +overwrite into table varchar_fields +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@varchar_fields +PREHOOK: query: EXPLAIN VECTORIZATION EXPRESSION +SELECT strct, strct.a, strct.b, strct.c FROM varchar_fields LIMIT 10 +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN VECTORIZATION EXPRESSION +SELECT strct, strct.a, strct.b, strct.c FROM varchar_fields LIMIT 10 +POSTHOOK: type: QUERY +PLAN VECTORIZATION: + enabled: true + enabledConditionsMet: [hive.vectorized.execution.enabled IS true] + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: varchar_fields + Statistics: Num rows: 1 Data size: 238 Basic stats: COMPLETE Column stats: NONE + TableScan Vectorization: + native: true + Select Operator + expressions: strct (type: struct), strct.a (type: int), strct.b (type: varchar(5)), strct.c (type: varchar(5)) + outputColumnNames: _col0, _col1, _col2, _col3 + Select Vectorization: + className: VectorSelectOperator + native: true + projectedOutputColumnNums: [0, 2, 3, 4] + selectExpressions: VectorUDFStructField(col 0:struct, col 0:int) -> 2:int, VectorUDFStructField(col 0:struct, col 1:int) -> 3:varchar(5), VectorUDFStructField(col 0:struct, col 2:int) -> 4:varchar(5) + Statistics: Num rows: 1 Data size: 238 Basic stats: COMPLETE Column stats: NONE + Limit + Number of rows: 10 + Limit Vectorization: + className: VectorLimitOperator + native: true + Statistics: Num rows: 1 Data size: 238 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + File Sink Vectorization: + className: VectorFileSinkOperator + native: false + Statistics: Num rows: 1 Data size: 238 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Execution mode: vectorized, llap + LLAP IO: no inputs + Map Vectorization: + enabled: true + enabledConditionsMet: hive.vectorized.use.vector.serde.deserialize IS true + inputFormatFeatureSupport: [DECIMAL_64] + vectorizationSupportRemovedReasons: [DECIMAL_64 disabled because LLAP is enabled] + featureSupportInUse: [] + inputFileFormats: org.apache.hadoop.mapred.TextInputFormat + allNative: false + usesVectorUDFAdaptor: false + vectorized: true + + Stage: Stage-0 + Fetch Operator + limit: 10 + Processor Tree: + ListSink 
+ +PREHOOK: query: SELECT strct, strct.a, strct.b, strct.c FROM varchar_fields LIMIT 10 +PREHOOK: type: QUERY +PREHOOK: Input: default@varchar_fields +#### A masked pattern was here #### +POSTHOOK: query: SELECT strct, strct.a, strct.b, strct.c FROM varchar_fields LIMIT 10 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@varchar_fields +#### A masked pattern was here #### +{"a":238,"b":"val_2","c":null} 238 val_2 NULL +{"a":86,"b":"val_8","c":null} 86 val_8 NULL +{"a":311,"b":"val_3","c":null} 311 val_3 NULL +{"a":27,"b":"val_2","c":null} 27 val_2 NULL +{"a":165,"b":"val_1","c":null} 165 val_1 NULL +{"a":409,"b":"val_4","c":null} 409 val_4 NULL +{"a":255,"b":"val_2","c":null} 255 val_2 NULL +{"a":278,"b":"val_2","c":null} 278 val_2 NULL +{"a":98,"b":"val_9","c":null} 98 val_9 NULL +{"a":484,"b":"val_4","c":null} 484 val_4 NULL diff --git serde/src/java/org/apache/hadoop/hive/serde2/lazy/fast/LazySimpleDeserializeRead.java serde/src/java/org/apache/hadoop/hive/serde2/lazy/fast/LazySimpleDeserializeRead.java index fe0ee48..dd88da8 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/lazy/fast/LazySimpleDeserializeRead.java +++ serde/src/java/org/apache/hadoop/hive/serde2/lazy/fast/LazySimpleDeserializeRead.java @@ -497,6 +497,12 @@ private void topLevelParse() { private int parseComplexField(int start, int end, int level) { + if (start == end + 1) { + + // Data prematurely ended. Return start - 1 so we don't move our field position. + return start - 1; + } + final byte separator = separators[level]; int fieldByteEnd = start; @@ -996,7 +1002,9 @@ public boolean readComplexField() throws IOException { final ListComplexTypeHelper listHelper = (ListComplexTypeHelper) complexTypeHelper; final int fieldPosition = listHelper.fieldPosition; final int complexFieldEnd = listHelper.complexFieldEnd; - Preconditions.checkState(fieldPosition <= complexFieldEnd); + + // When data is prematurely ended the fieldPosition will be 1 more than the end. + Preconditions.checkState(fieldPosition <= complexFieldEnd + 1); final int fieldEnd = parseComplexField(fieldPosition, complexFieldEnd, currentLevel); listHelper.fieldPosition = fieldEnd + 1; // Move past separator. @@ -1011,7 +1019,9 @@ public boolean readComplexField() throws IOException { final MapComplexTypeHelper mapHelper = (MapComplexTypeHelper) complexTypeHelper; final int fieldPosition = mapHelper.fieldPosition; final int complexFieldEnd = mapHelper.complexFieldEnd; - Preconditions.checkState(fieldPosition <= complexFieldEnd); + + // When data is prematurely ended the fieldPosition will be 1 more than the end. + Preconditions.checkState(fieldPosition <= complexFieldEnd + 1); currentFieldStart = fieldPosition; @@ -1057,7 +1067,9 @@ public boolean readComplexField() throws IOException { final StructComplexTypeHelper structHelper = (StructComplexTypeHelper) complexTypeHelper; final int fieldPosition = structHelper.fieldPosition; final int complexFieldEnd = structHelper.complexFieldEnd; - Preconditions.checkState(fieldPosition <= complexFieldEnd); + + // When data is prematurely ended the fieldPosition will be 1 more than the end. + Preconditions.checkState(fieldPosition <= complexFieldEnd + 1); currentFieldStart = fieldPosition; @@ -1069,7 +1081,7 @@ public boolean readComplexField() throws IOException { // Parse until field separator (currentLevel). fieldEnd = parseComplexField(fieldPosition, complexFieldEnd, currentLevel); - structHelper.fieldPosition = fieldEnd + 1; // Move past key separator. 
+ structHelper.fieldPosition = fieldEnd + 1; // Move past parent field separator. currentFieldLength = fieldEnd - fieldPosition; @@ -1101,7 +1113,9 @@ public boolean readComplexField() throws IOException { final UnionComplexTypeHelper unionHelper = (UnionComplexTypeHelper) complexTypeHelper; final int fieldPosition = unionHelper.fieldPosition; final int complexFieldEnd = unionHelper.complexFieldEnd; - Preconditions.checkState(fieldPosition <= complexFieldEnd); + + // When data is prematurely ended the fieldPosition will be 1 more than the end. + Preconditions.checkState(fieldPosition <= complexFieldEnd + 1); currentFieldStart = fieldPosition; diff --git storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.java storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.java index 5e25c47..953604c 100644 --- storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.java +++ storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.java @@ -181,7 +181,9 @@ public void setVal(int elementNum, byte[] sourceBuf, int start, int length) { if ((nextFree + length) > buffer.length) { increaseBufferSpace(length); } - System.arraycopy(sourceBuf, start, buffer, nextFree, length); + if (length > 0) { + System.arraycopy(sourceBuf, start, buffer, nextFree, length); + } vector[elementNum] = buffer; this.start[elementNum] = nextFree; this.length[elementNum] = length;