diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties index 32291223c5..6222bb1df3 100644 --- a/itests/src/test/resources/testconfiguration.properties +++ b/itests/src/test/resources/testconfiguration.properties @@ -784,6 +784,7 @@ minillaplocal.query.files=\ udaf_collect_set_2.q,\ udaf_all_keyword.q,\ udf_coalesce.q,\ + udf_mask_vectorized.q,\ llap_notequalns.q,\ union_assertion_type.q,\ union_fast_stats.q,\ diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/UDFMaskBaseVector.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/UDFMaskBaseVector.java new file mode 100644 index 0000000000..e6f2c53d35 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/UDFMaskBaseVector.java @@ -0,0 +1,127 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.expressions; + +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; + +import org.apache.hadoop.hive.ql.metadata.HiveException; + +public abstract class UDFMaskBaseVector extends VectorExpression { + + private static final long serialVersionUID = 1L; + + static final int MASKED_UPPERCASE = 'X'; + static final int MASKED_LOWERCASE = 'x'; + static final int MASKED_DIGIT = 'n'; + static final int MASKED_OTHER_CHAR = -1; + static final int MASKED_NUMBER = 1; + static final int MASKED_DAY_COMPONENT_VAL = 1; + static final int MASKED_MONTH_COMPONENT_VAL = 0; + static final int MASKED_YEAR_COMPONENT_VAL = 0; + static final int UNMASKED_VAL = -1; + + int maskedUpperChar = MASKED_UPPERCASE; + int maskedLowerChar = MASKED_LOWERCASE; + int maskedDigitChar = MASKED_DIGIT; + int maskedOtherChar = MASKED_OTHER_CHAR; + int maskedNumber = MASKED_NUMBER; + int maskedDayValue = MASKED_DAY_COMPONENT_VAL; + int maskedMonthValue = MASKED_MONTH_COMPONENT_VAL; + int maskedYearValue = MASKED_YEAR_COMPONENT_VAL; + + private final int colNum; + + public UDFMaskBaseVector(int colNum, int outputColumnNum) { + super(outputColumnNum); + this.colNum = colNum; + } + + public UDFMaskBaseVector() { + super(); + + // Dummy final assignments. + colNum = -1; + } + + public int getColNum() { + return colNum; + } + + public abstract void transform(ColumnVector outputColVector, ColumnVector inputVector, int idx); + + @Override + public void evaluate(VectorizedRowBatch batch) throws HiveException { + + if (childExpressions != null) { + super.evaluateChildren(batch); + } + + int batchSize = batch.size; + + if(batchSize <=0) { + return; + } + + ColumnVector inputColVector = batch.cols[colNum]; + ColumnVector outputColVector = batch.cols[outputColumnNum]; + + outputColVector.noNulls = inputColVector.noNulls; + + if(inputColVector.isRepeating) { + outputColVector.isRepeating = true; + if(inputColVector.noNulls || !inputColVector.isNull[0]) { + // no NULLs + transform(outputColVector, inputColVector, 0); + outputColVector.noNulls = true; + } else { + // all NULLs + outputColVector.isNull[0] = true; + outputColVector.noNulls = false; + } + } + if(batch.selectedInUse) { + int[] selectedRows = batch.selected; + for(int i=0; i != batchSize; i++) { + final int offset = selectedRows[i]; + if(inputColVector.noNulls || !inputColVector.isNull[offset]) { + // non-null + transform(outputColVector, inputColVector, offset); + } else { + outputColVector.isNull[i] = true; + } + } + } else { + // all rows are to be processed + for(int i=0; i != batchSize; i++) { + if(inputColVector.noNulls || !inputColVector.isNull[i]) { + // non-null + transform(outputColVector, inputColVector, i); + } else { + outputColVector.isNull[i] = true; + } + } + } + } + + @Override + public String vectorExpressionParameters() { + return getColumnParamString(0, colNum); + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/UDFMaskVectorDate.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/UDFMaskVectorDate.java new file mode 100644 index 0000000000..dfa647e88a --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/UDFMaskVectorDate.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.expressions; + +import org.apache.hadoop.hive.common.type.Date; +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorExpressionDescriptor; +import org.apache.hadoop.hive.serde2.io.DateWritableV2; + +public class UDFMaskVectorDate extends UDFMaskBaseVector { + + public UDFMaskVectorDate(int colNum, int outputColumnNum) { + super(colNum, outputColumnNum); + } + + public UDFMaskVectorDate() { + super(); + } + + public void transform(ColumnVector outputColVector, ColumnVector inputVector, int idx) { + outputColVector.isNull[idx] = false; + + long inputVal = ((LongColumnVector)inputVector).vector[idx]; + Date inputDate = new Date(); + inputDate.setTimeInMillis(DateWritableV2.daysToMillis((int) inputVal)); + + int actualMonthValue = maskedMonthValue + 1; + int year = maskedYearValue == UNMASKED_VAL ? inputDate.getYear() : maskedYearValue; + int month = maskedMonthValue == UNMASKED_VAL ? inputDate.getMonth() : actualMonthValue; + int day = maskedDayValue == UNMASKED_VAL ? inputDate.getDay() : maskedDayValue; + + ((LongColumnVector) outputColVector).vector[idx] = + DateWritableV2.dateToDays(Date.of(year, month, day)); + } + + @Override + public String vectorExpressionParameters() { + return getColumnParamString(0, getColNum()); + } + + @Override + public VectorExpressionDescriptor.Descriptor getDescriptor() { + VectorExpressionDescriptor.Builder b = new VectorExpressionDescriptor.Builder(); + b.setMode(VectorExpressionDescriptor.Mode.PROJECTION) + .setNumArguments(1) + .setArgumentTypes( + VectorExpressionDescriptor.ArgumentType.DATE) + .setInputExpressionTypes( + VectorExpressionDescriptor.InputExpressionType.COLUMN); + return b.build(); + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/UDFMaskVectorLong.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/UDFMaskVectorLong.java new file mode 100644 index 0000000000..d32f182976 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/UDFMaskVectorLong.java @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.expressions; + +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorExpressionDescriptor; +import org.apache.hadoop.hive.serde.serdeConstants; + +public class UDFMaskVectorLong extends UDFMaskBaseVector { + + public UDFMaskVectorLong(int colNum, int outputColumnNum) { + super(colNum, outputColumnNum); + } + + public UDFMaskVectorLong() { + super(); + } + + public void transform(ColumnVector outputColVector, ColumnVector inputVector, int idx) { + boolean isBooleanType = + this.getInputTypeInfos()[0].getTypeName().equals(serdeConstants.BOOLEAN_TYPE_NAME); + + // boolean data type is represented as long and masking is not supported for boolean + if(isBooleanType) { + outputColVector.isNull[idx] = true; + outputColVector.noNulls = false; + return; + } + outputColVector.isNull[idx] = false; + + long[] inputCol = ((LongColumnVector)inputVector).vector; + long[] outputCol = ((LongColumnVector)outputColVector).vector; + + long value = inputCol[idx]; + long val = value; + + if(value < 0) { + val *= -1; + } + + long ret = 0; + long pos = 1; + for(int i = 0; val != 0; i++) { + ret += maskedNumber * pos; + + val /= 10; + pos *= 10; + } + + if(value < 0) { + ret *= -1; + } + outputCol[idx] = ret; + } + + @Override + public String vectorExpressionParameters() { + return getColumnParamString(0, getColNum()); + } + + @Override + public VectorExpressionDescriptor.Descriptor getDescriptor() { + VectorExpressionDescriptor.Builder b = new VectorExpressionDescriptor.Builder(); + b.setMode(VectorExpressionDescriptor.Mode.PROJECTION) + .setNumArguments(1) + .setArgumentTypes( + VectorExpressionDescriptor.ArgumentType.INT_FAMILY) + .setInputExpressionTypes( + VectorExpressionDescriptor.InputExpressionType.COLUMN); + return b.build(); + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/UDFMaskVectorString.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/UDFMaskVectorString.java new file mode 100644 index 0000000000..28e69609dc --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/UDFMaskVectorString.java @@ -0,0 +1,106 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.expressions; + +import java.nio.charset.Charset; +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorExpressionDescriptor; + +public class UDFMaskVectorString extends UDFMaskBaseVector { + + public UDFMaskVectorString(int colNum, int outputColumnNum) { + super(colNum, outputColumnNum); + } + + public UDFMaskVectorString() { + super(); + } + + + protected int transformChar(final int c) { + switch(Character.getType(c)) { + case Character.UPPERCASE_LETTER: + if(maskedUpperChar != UNMASKED_VAL) { + return maskedUpperChar; + } + break; + + case Character.LOWERCASE_LETTER: + if(maskedLowerChar != UNMASKED_VAL) { + return maskedLowerChar; + } + break; + + case Character.DECIMAL_DIGIT_NUMBER: + if(maskedDigitChar != UNMASKED_VAL) { + return maskedDigitChar; + } + break; + + default: + if(maskedOtherChar != UNMASKED_VAL) { + return maskedOtherChar; + } + break; + } + + return c; + } + + public void transform(ColumnVector outputColVector, ColumnVector inputVector, int idx) { + outputColVector.isNull[idx] = false; + + BytesColumnVector inColVector = ((BytesColumnVector)inputVector); + BytesColumnVector outColVector = ((BytesColumnVector)outputColVector); + + byte[] inputBuffer = inColVector.vector[idx]; + int[] offsets = inColVector.start; + int[] length = inColVector.length; + + String inputStr = new String(inputBuffer, offsets[idx], length[idx], Charset.forName("UTF-8")); + + StringBuilder ret = new StringBuilder(inputStr.length()); + + + for(int i = 0; i < inputStr.length(); i++) { + ret.appendCodePoint(transformChar(inputStr.charAt(i))); + } + + String outStr = ret.toString(); + outColVector.setVal(idx, outStr.getBytes(Charset.forName("UTF-8")), 0, outStr.length()); + } + + @Override + public String vectorExpressionParameters() { + return getColumnParamString(0, getColNum()); + } + + @Override + public VectorExpressionDescriptor.Descriptor getDescriptor() { + VectorExpressionDescriptor.Builder b = new VectorExpressionDescriptor.Builder(); + b.setMode(VectorExpressionDescriptor.Mode.PROJECTION) + .setNumArguments(1) + .setArgumentTypes( + VectorExpressionDescriptor.ArgumentType.STRING_FAMILY) + .setInputExpressionTypes( + VectorExpressionDescriptor.InputExpressionType.COLUMN); + return b.build(); + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFMask.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFMask.java index 27c3bf8aa6..a409403b62 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFMask.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFMask.java @@ -21,6 +21,8 @@ import org.apache.hadoop.hive.common.type.Date; import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.vector.*; +import org.apache.hadoop.hive.ql.exec.vector.expressions.*; import org.apache.hadoop.hive.serde2.io.ShortWritable; import org.apache.hadoop.hive.serde2.objectinspector.ConstantObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; @@ -47,6 +49,7 @@ + " monthValue - value to replace month field in a date with. Specify -1 to retain original value. Valid values: 0-11. Default value: 0\n " + " yearValue - value to replace year field in a date with. Specify -1 to retain original value. Default value: 0\n " ) +@VectorizedExpressions({ UDFMaskVectorString.class, UDFMaskVectorLong.class, UDFMaskVectorDate.class}) public class GenericUDFMask extends BaseMaskUDF { public static final String UDF_NAME = "mask"; diff --git a/ql/src/test/queries/clientpositive/udf_mask_vectorized.q b/ql/src/test/queries/clientpositive/udf_mask_vectorized.q new file mode 100644 index 0000000000..4d27e6aa16 --- /dev/null +++ b/ql/src/test/queries/clientpositive/udf_mask_vectorized.q @@ -0,0 +1,55 @@ +set hive.mapred.mode=nonstrict; +set hive.explain.user=false; +SET hive.vectorized.execution.enabled=true; +set hive.fetch.task.conversion=minimal; + +create table tmask(t tinyint, s smallint, i int, bg bigint, f float, db double, dc decimal (10,3), n numeric, + d date, ts timestamp, + str string, vr varchar(10), ch char(4), + b boolean, bin binary); + +insert into tmask values(1,2,345,4455433,5.6,5644.455,10.20, 579.00, '2019-09-09', current_timestamp(), 'string1', 'varchar1', 'ch1', true, 'bin'), + (NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL), + (9,7,3450,7455433,5.08,5944.455,10.20, 579.00, '1019-09-09', current_timestamp(), 'string2', 'varchar2', 'ch2', false, 'bin2'), + (NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL); + + +-- MASK UDF with single argument +-- date types (timestamp is not supported for masking) +set hive.vectorized.execution.enabled=true; +explain VECTORIZATION DETAIL select mask(d), mask(ts) from tmask where s > 0 and i < 10000000; +select mask(d), mask(ts) from tmask where s > 0 and i < 10000000; +set hive.vectorized.execution.enabled=false; +select mask(d), mask(ts) from tmask where s > 0 and i < 10000000; + +-- numeric types, double, float, demical and numeric are not supported for masking +set hive.vectorized.execution.enabled=true; +explain VECTORIZATION DETAIL select mask(t), mask(s), mask(i), mask(bg), mask(f), mask(db), mask(dc), mask(n) from tmask; +select mask(t), mask(s), mask(i), mask(bg), mask(f), mask(db), mask(dc), mask(n) from tmask; +set hive.vectorized.execution.enabled=false; +select mask(t), mask(s), mask(i), mask(bg), mask(f), mask(db), mask(dc), mask(n) from tmask; + +-- string + misc types +set hive.vectorized.execution.enabled=true; +explain VECTORIZATION DETAIL select mask(str), mask(vr), mask(ch), mask(b), mask(bin) from tmask ; +select mask(str), mask(vr), mask(ch), mask(b), mask(bin) from tmask ; +set hive.vectorized.execution.enabled=false; +select mask(str), mask(vr), mask(ch), mask(b), mask(bin) from tmask ; + +create temporary table tmask_temp(t date, s string, i string, bg string, f string, db string, dc string, n string, + d string, ts string, + str string, vr string, ch string, + b string, bin string); +set hive.vectorized.execution.enabled=true; +insert into tmask_temp select mask(d), mask(ts), mask(t), mask(s), mask(i), mask(bg), mask(f), mask(db), mask(dc), mask(n), + mask(str), mask(vr), mask(ch), mask(b), mask(bin) from tmask ; +select count(*) from tmask_temp group by (t,s,i,bg,f,db,dc,n,d,ts,str,vr,ch,b,bin); +set hive.vectorized.execution.enabled=false; +insert into tmask_temp select mask(d), mask(ts), mask(t), mask(s), mask(i), mask(bg), mask(f), mask(db), mask(dc), mask(n), + mask(str), mask(vr), mask(ch), mask(b), mask(bin) from tmask ; + +set hive.vectorized.execution.enabled=true; +-- should be double the above select count +select count(*) from tmask_temp group by (t,s,i,bg,f,db,dc,n,d,ts,str,vr,ch,b,bin); + +DROP TABLE tmask; diff --git a/ql/src/test/results/clientpositive/llap/udf_mask_vectorized.q.out b/ql/src/test/results/clientpositive/llap/udf_mask_vectorized.q.out new file mode 100644 index 0000000000..f6c9068e1e --- /dev/null +++ b/ql/src/test/results/clientpositive/llap/udf_mask_vectorized.q.out @@ -0,0 +1,429 @@ +PREHOOK: query: create table tmask(t tinyint, s smallint, i int, bg bigint, f float, db double, dc decimal (10,3), n numeric, + d date, ts timestamp, + str string, vr varchar(10), ch char(4), + b boolean, bin binary) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@tmask +POSTHOOK: query: create table tmask(t tinyint, s smallint, i int, bg bigint, f float, db double, dc decimal (10,3), n numeric, + d date, ts timestamp, + str string, vr varchar(10), ch char(4), + b boolean, bin binary) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@tmask +PREHOOK: query: insert into tmask values(1,2,345,4455433,5.6,5644.455,10.20, 579.00, '2019-09-09', current_timestamp(), 'string1', 'varchar1', 'ch1', true, 'bin'), + (NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL), + (9,7,3450,7455433,5.08,5944.455,10.20, 579.00, '1019-09-09', current_timestamp(), 'string2', 'varchar2', 'ch2', false, 'bin2'), + (NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@tmask +POSTHOOK: query: insert into tmask values(1,2,345,4455433,5.6,5644.455,10.20, 579.00, '2019-09-09', current_timestamp(), 'string1', 'varchar1', 'ch1', true, 'bin'), + (NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL), + (9,7,3450,7455433,5.08,5944.455,10.20, 579.00, '1019-09-09', current_timestamp(), 'string2', 'varchar2', 'ch2', false, 'bin2'), + (NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@tmask +POSTHOOK: Lineage: tmask.b SCRIPT [] +POSTHOOK: Lineage: tmask.bg SCRIPT [] +POSTHOOK: Lineage: tmask.bin SCRIPT [] +POSTHOOK: Lineage: tmask.ch SCRIPT [] +POSTHOOK: Lineage: tmask.d SCRIPT [] +POSTHOOK: Lineage: tmask.db SCRIPT [] +POSTHOOK: Lineage: tmask.dc SCRIPT [] +POSTHOOK: Lineage: tmask.f SCRIPT [] +POSTHOOK: Lineage: tmask.i SCRIPT [] +POSTHOOK: Lineage: tmask.n SCRIPT [] +POSTHOOK: Lineage: tmask.s SCRIPT [] +POSTHOOK: Lineage: tmask.str SCRIPT [] +POSTHOOK: Lineage: tmask.t SCRIPT [] +POSTHOOK: Lineage: tmask.ts SCRIPT [] +POSTHOOK: Lineage: tmask.vr SCRIPT [] +PREHOOK: query: explain VECTORIZATION DETAIL select mask(d), mask(ts) from tmask where s > 0 and i < 10000000 +PREHOOK: type: QUERY +PREHOOK: Input: default@tmask +#### A masked pattern was here #### +POSTHOOK: query: explain VECTORIZATION DETAIL select mask(d), mask(ts) from tmask where s > 0 and i < 10000000 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tmask +#### A masked pattern was here #### +PLAN VECTORIZATION: + enabled: true + enabledConditionsMet: [hive.vectorized.execution.enabled IS true] + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: tmask + filterExpr: ((s > 0S) and (i < 10000000)) (type: boolean) + Statistics: Num rows: 4 Data size: 352 Basic stats: COMPLETE Column stats: COMPLETE + TableScan Vectorization: + native: true + vectorizationSchemaColumns: [0:t:tinyint, 1:s:smallint, 2:i:int, 3:bg:bigint, 4:f:float, 5:db:double, 6:dc:decimal(10,3)/DECIMAL_64, 7:n:decimal(10,0)/DECIMAL_64, 8:d:date, 9:ts:timestamp, 10:str:string, 11:vr:varchar(10), 12:ch:char(4), 13:b:boolean, 14:bin:binary, 15:ROW__ID:struct] + Filter Operator + Filter Vectorization: + className: VectorFilterOperator + native: true + predicateExpression: FilterExprAndExpr(children: FilterLongColGreaterLongScalar(col 1:smallint, val 0), FilterLongColLessLongScalar(col 2:int, val 10000000)) + predicate: ((i < 10000000) and (s > 0S)) (type: boolean) + Statistics: Num rows: 4 Data size: 352 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: mask(d) (type: date), mask(ts) (type: timestamp) + outputColumnNames: _col0, _col1 + Select Vectorization: + className: VectorSelectOperator + native: true + projectedOutputColumnNums: [16, 17] + selectExpressions: UDFMaskVectorDate(col 8:date) -> 16:date, VectorUDFAdaptor(mask(ts)) -> 17:timestamp + Statistics: Num rows: 4 Data size: 384 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + File Sink Vectorization: + className: VectorFileSinkOperator + native: false + Statistics: Num rows: 4 Data size: 384 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Execution mode: vectorized, llap + LLAP IO: no inputs + Map Vectorization: + enabled: true + enabledConditionsMet: hive.vectorized.use.vector.serde.deserialize IS true + inputFormatFeatureSupport: [DECIMAL_64] + featureSupportInUse: [DECIMAL_64] + inputFileFormats: org.apache.hadoop.mapred.TextInputFormat + allNative: false + usesVectorUDFAdaptor: true + vectorized: true + rowBatchContext: + dataColumnCount: 15 + includeColumns: [1, 2, 8, 9] + dataColumns: t:tinyint, s:smallint, i:int, bg:bigint, f:float, db:double, dc:decimal(10,3)/DECIMAL_64, n:decimal(10,0)/DECIMAL_64, d:date, ts:timestamp, str:string, vr:varchar(10), ch:char(4), b:boolean, bin:binary + partitionColumnCount: 0 + scratchColumnTypeNames: [bigint, timestamp] + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select mask(d), mask(ts) from tmask where s > 0 and i < 10000000 +PREHOOK: type: QUERY +PREHOOK: Input: default@tmask +#### A masked pattern was here #### +POSTHOOK: query: select mask(d), mask(ts) from tmask where s > 0 and i < 10000000 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tmask +#### A masked pattern was here #### +0001-01-01 NULL +0001-01-01 NULL +PREHOOK: query: select mask(d), mask(ts) from tmask where s > 0 and i < 10000000 +PREHOOK: type: QUERY +PREHOOK: Input: default@tmask +#### A masked pattern was here #### +POSTHOOK: query: select mask(d), mask(ts) from tmask where s > 0 and i < 10000000 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tmask +#### A masked pattern was here #### +0001-01-01 NULL +0001-01-01 NULL +PREHOOK: query: explain VECTORIZATION DETAIL select mask(t), mask(s), mask(i), mask(bg), mask(f), mask(db), mask(dc), mask(n) from tmask +PREHOOK: type: QUERY +PREHOOK: Input: default@tmask +#### A masked pattern was here #### +POSTHOOK: query: explain VECTORIZATION DETAIL select mask(t), mask(s), mask(i), mask(bg), mask(f), mask(db), mask(dc), mask(n) from tmask +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tmask +#### A masked pattern was here #### +PLAN VECTORIZATION: + enabled: true + enabledConditionsMet: [hive.vectorized.execution.enabled IS true] + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: tmask + Statistics: Num rows: 4 Data size: 768 Basic stats: COMPLETE Column stats: COMPLETE + TableScan Vectorization: + native: true + vectorizationSchemaColumns: [0:t:tinyint, 1:s:smallint, 2:i:int, 3:bg:bigint, 4:f:float, 5:db:double, 6:dc:decimal(10,3)/DECIMAL_64, 7:n:decimal(10,0)/DECIMAL_64, 8:d:date, 9:ts:timestamp, 10:str:string, 11:vr:varchar(10), 12:ch:char(4), 13:b:boolean, 14:bin:binary, 15:ROW__ID:struct] + Select Operator + expressions: mask(t) (type: tinyint), mask(s) (type: smallint), mask(i) (type: int), mask(bg) (type: bigint), mask(f) (type: float), mask(db) (type: double), mask(dc) (type: decimal(38,18)), mask(n) (type: decimal(38,18)) + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7 + Select Vectorization: + className: VectorSelectOperator + native: true + projectedOutputColumnNums: [16, 17, 18, 19, 20, 21, 22, 23] + selectExpressions: UDFMaskVectorLong(col 0:tinyint) -> 16:tinyint, UDFMaskVectorLong(col 1:smallint) -> 17:smallint, UDFMaskVectorLong(col 2:int) -> 18:int, UDFMaskVectorLong(col 3:bigint) -> 19:bigint, VectorUDFAdaptor(mask(f)) -> 20:float, VectorUDFAdaptor(mask(db)) -> 21:double, VectorUDFAdaptor(mask(dc)) -> 22:decimal(38,18), VectorUDFAdaptor(mask(n)) -> 23:decimal(38,18) + Statistics: Num rows: 4 Data size: 1024 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + File Sink Vectorization: + className: VectorFileSinkOperator + native: false + Statistics: Num rows: 4 Data size: 1024 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Execution mode: vectorized, llap + LLAP IO: no inputs + Map Vectorization: + enabled: true + enabledConditionsMet: hive.vectorized.use.vector.serde.deserialize IS true + inputFormatFeatureSupport: [DECIMAL_64] + featureSupportInUse: [DECIMAL_64] + inputFileFormats: org.apache.hadoop.mapred.TextInputFormat + allNative: false + usesVectorUDFAdaptor: true + vectorized: true + rowBatchContext: + dataColumnCount: 15 + includeColumns: [0, 1, 2, 3, 4, 5, 6, 7] + dataColumns: t:tinyint, s:smallint, i:int, bg:bigint, f:float, db:double, dc:decimal(10,3)/DECIMAL_64, n:decimal(10,0)/DECIMAL_64, d:date, ts:timestamp, str:string, vr:varchar(10), ch:char(4), b:boolean, bin:binary + partitionColumnCount: 0 + scratchColumnTypeNames: [bigint, bigint, bigint, bigint, double, double, decimal(38,18), decimal(38,18)] + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select mask(t), mask(s), mask(i), mask(bg), mask(f), mask(db), mask(dc), mask(n) from tmask +PREHOOK: type: QUERY +PREHOOK: Input: default@tmask +#### A masked pattern was here #### +POSTHOOK: query: select mask(t), mask(s), mask(i), mask(bg), mask(f), mask(db), mask(dc), mask(n) from tmask +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tmask +#### A masked pattern was here #### +1 1 111 1111111 NULL NULL NULL NULL +NULL NULL NULL NULL NULL NULL NULL NULL +1 1 1111 1111111 NULL NULL NULL NULL +NULL NULL NULL NULL NULL NULL NULL NULL +PREHOOK: query: select mask(t), mask(s), mask(i), mask(bg), mask(f), mask(db), mask(dc), mask(n) from tmask +PREHOOK: type: QUERY +PREHOOK: Input: default@tmask +#### A masked pattern was here #### +POSTHOOK: query: select mask(t), mask(s), mask(i), mask(bg), mask(f), mask(db), mask(dc), mask(n) from tmask +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tmask +#### A masked pattern was here #### +1 1 111 1111111 NULL NULL NULL NULL +NULL NULL NULL NULL NULL NULL NULL NULL +1 1 1111 1111111 NULL NULL NULL NULL +NULL NULL NULL NULL NULL NULL NULL NULL +PREHOOK: query: explain VECTORIZATION DETAIL select mask(str), mask(vr), mask(ch), mask(b), mask(bin) from tmask +PREHOOK: type: QUERY +PREHOOK: Input: default@tmask +#### A masked pattern was here #### +POSTHOOK: query: explain VECTORIZATION DETAIL select mask(str), mask(vr), mask(ch), mask(b), mask(bin) from tmask +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tmask +#### A masked pattern was here #### +PLAN VECTORIZATION: + enabled: true + enabledConditionsMet: [hive.vectorized.execution.enabled IS true] + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: tmask + Statistics: Num rows: 4 Data size: 942 Basic stats: COMPLETE Column stats: COMPLETE + TableScan Vectorization: + native: true + vectorizationSchemaColumns: [0:t:tinyint, 1:s:smallint, 2:i:int, 3:bg:bigint, 4:f:float, 5:db:double, 6:dc:decimal(10,3)/DECIMAL_64, 7:n:decimal(10,0)/DECIMAL_64, 8:d:date, 9:ts:timestamp, 10:str:string, 11:vr:varchar(10), 12:ch:char(4), 13:b:boolean, 14:bin:binary, 15:ROW__ID:struct] + Select Operator + expressions: mask(str) (type: string), mask(vr) (type: varchar(65535)), mask(ch) (type: char(255)), mask(b) (type: boolean), mask(bin) (type: binary) + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Select Vectorization: + className: VectorSelectOperator + native: true + projectedOutputColumnNums: [16, 17, 18, 19, 20] + selectExpressions: UDFMaskVectorString(col 10:string) -> 16:string, UDFMaskVectorString(col 11:varchar(10)) -> 17:varchar(65535), UDFMaskVectorString(col 12:char(4)) -> 18:char(255), UDFMaskVectorLong(col 13:boolean) -> 19:boolean, VectorUDFAdaptor(mask(bin)) -> 20:binary + Statistics: Num rows: 4 Data size: 265160 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + File Sink Vectorization: + className: VectorFileSinkOperator + native: false + Statistics: Num rows: 4 Data size: 265160 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Execution mode: vectorized, llap + LLAP IO: no inputs + Map Vectorization: + enabled: true + enabledConditionsMet: hive.vectorized.use.vector.serde.deserialize IS true + inputFormatFeatureSupport: [DECIMAL_64] + featureSupportInUse: [DECIMAL_64] + inputFileFormats: org.apache.hadoop.mapred.TextInputFormat + allNative: false + usesVectorUDFAdaptor: true + vectorized: true + rowBatchContext: + dataColumnCount: 15 + includeColumns: [10, 11, 12, 13, 14] + dataColumns: t:tinyint, s:smallint, i:int, bg:bigint, f:float, db:double, dc:decimal(10,3)/DECIMAL_64, n:decimal(10,0)/DECIMAL_64, d:date, ts:timestamp, str:string, vr:varchar(10), ch:char(4), b:boolean, bin:binary + partitionColumnCount: 0 + scratchColumnTypeNames: [string, string, string, bigint, string] + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select mask(str), mask(vr), mask(ch), mask(b), mask(bin) from tmask +PREHOOK: type: QUERY +PREHOOK: Input: default@tmask +#### A masked pattern was here #### +POSTHOOK: query: select mask(str), mask(vr), mask(ch), mask(b), mask(bin) from tmask +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tmask +#### A masked pattern was here #### +xxxxxxn xxxxxxxn xxn NULL NULL +NULL NULL NULL NULL NULL +xxxxxxn xxxxxxxn xxn NULL NULL +NULL NULL NULL NULL NULL +PREHOOK: query: select mask(str), mask(vr), mask(ch), mask(b), mask(bin) from tmask +PREHOOK: type: QUERY +PREHOOK: Input: default@tmask +#### A masked pattern was here #### +POSTHOOK: query: select mask(str), mask(vr), mask(ch), mask(b), mask(bin) from tmask +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tmask +#### A masked pattern was here #### +xxxxxxn xxxxxxxn xxn NULL NULL +NULL NULL NULL NULL NULL +xxxxxxn xxxxxxxn xxn NULL NULL +NULL NULL NULL NULL NULL +PREHOOK: query: create temporary table tmask_temp(t date, s string, i string, bg string, f string, db string, dc string, n string, + d string, ts string, + str string, vr string, ch string, + b string, bin string) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@tmask_temp +POSTHOOK: query: create temporary table tmask_temp(t date, s string, i string, bg string, f string, db string, dc string, n string, + d string, ts string, + str string, vr string, ch string, + b string, bin string) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@tmask_temp +PREHOOK: query: insert into tmask_temp select mask(d), mask(ts), mask(t), mask(s), mask(i), mask(bg), mask(f), mask(db), mask(dc), mask(n), + mask(str), mask(vr), mask(ch), mask(b), mask(bin) from tmask +PREHOOK: type: QUERY +PREHOOK: Input: default@tmask +PREHOOK: Output: default@tmask_temp +POSTHOOK: query: insert into tmask_temp select mask(d), mask(ts), mask(t), mask(s), mask(i), mask(bg), mask(f), mask(db), mask(dc), mask(n), + mask(str), mask(vr), mask(ch), mask(b), mask(bin) from tmask +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tmask +POSTHOOK: Output: default@tmask_temp +POSTHOOK: Lineage: tmask_temp.b EXPRESSION [(tmask)tmask.FieldSchema(name:b, type:boolean, comment:null), ] +POSTHOOK: Lineage: tmask_temp.bg EXPRESSION [(tmask)tmask.FieldSchema(name:s, type:smallint, comment:null), ] +POSTHOOK: Lineage: tmask_temp.bin EXPRESSION [(tmask)tmask.FieldSchema(name:bin, type:binary, comment:null), ] +POSTHOOK: Lineage: tmask_temp.ch EXPRESSION [(tmask)tmask.FieldSchema(name:ch, type:char(4), comment:null), ] +POSTHOOK: Lineage: tmask_temp.d EXPRESSION [(tmask)tmask.FieldSchema(name:dc, type:decimal(10,3), comment:null), ] +POSTHOOK: Lineage: tmask_temp.db EXPRESSION [(tmask)tmask.FieldSchema(name:bg, type:bigint, comment:null), ] +POSTHOOK: Lineage: tmask_temp.dc EXPRESSION [(tmask)tmask.FieldSchema(name:f, type:float, comment:null), ] +POSTHOOK: Lineage: tmask_temp.f EXPRESSION [(tmask)tmask.FieldSchema(name:i, type:int, comment:null), ] +POSTHOOK: Lineage: tmask_temp.i EXPRESSION [(tmask)tmask.FieldSchema(name:t, type:tinyint, comment:null), ] +POSTHOOK: Lineage: tmask_temp.n EXPRESSION [(tmask)tmask.FieldSchema(name:db, type:double, comment:null), ] +POSTHOOK: Lineage: tmask_temp.s EXPRESSION [(tmask)tmask.FieldSchema(name:ts, type:timestamp, comment:null), ] +POSTHOOK: Lineage: tmask_temp.str EXPRESSION [(tmask)tmask.FieldSchema(name:str, type:string, comment:null), ] +POSTHOOK: Lineage: tmask_temp.t EXPRESSION [(tmask)tmask.FieldSchema(name:d, type:date, comment:null), ] +POSTHOOK: Lineage: tmask_temp.ts EXPRESSION [(tmask)tmask.FieldSchema(name:n, type:decimal(10,0), comment:null), ] +POSTHOOK: Lineage: tmask_temp.vr EXPRESSION [(tmask)tmask.FieldSchema(name:vr, type:varchar(10), comment:null), ] +PREHOOK: query: select count(*) from tmask_temp group by (t,s,i,bg,f,db,dc,n,d,ts,str,vr,ch,b,bin) +PREHOOK: type: QUERY +PREHOOK: Input: default@tmask_temp +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from tmask_temp group by (t,s,i,bg,f,db,dc,n,d,ts,str,vr,ch,b,bin) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tmask_temp +#### A masked pattern was here #### +1 +2 +1 +PREHOOK: query: insert into tmask_temp select mask(d), mask(ts), mask(t), mask(s), mask(i), mask(bg), mask(f), mask(db), mask(dc), mask(n), + mask(str), mask(vr), mask(ch), mask(b), mask(bin) from tmask +PREHOOK: type: QUERY +PREHOOK: Input: default@tmask +PREHOOK: Output: default@tmask_temp +POSTHOOK: query: insert into tmask_temp select mask(d), mask(ts), mask(t), mask(s), mask(i), mask(bg), mask(f), mask(db), mask(dc), mask(n), + mask(str), mask(vr), mask(ch), mask(b), mask(bin) from tmask +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tmask +POSTHOOK: Output: default@tmask_temp +POSTHOOK: Lineage: tmask_temp.b EXPRESSION [(tmask)tmask.FieldSchema(name:b, type:boolean, comment:null), ] +POSTHOOK: Lineage: tmask_temp.bg EXPRESSION [(tmask)tmask.FieldSchema(name:s, type:smallint, comment:null), ] +POSTHOOK: Lineage: tmask_temp.bin EXPRESSION [(tmask)tmask.FieldSchema(name:bin, type:binary, comment:null), ] +POSTHOOK: Lineage: tmask_temp.ch EXPRESSION [(tmask)tmask.FieldSchema(name:ch, type:char(4), comment:null), ] +POSTHOOK: Lineage: tmask_temp.d EXPRESSION [(tmask)tmask.FieldSchema(name:dc, type:decimal(10,3), comment:null), ] +POSTHOOK: Lineage: tmask_temp.db EXPRESSION [(tmask)tmask.FieldSchema(name:bg, type:bigint, comment:null), ] +POSTHOOK: Lineage: tmask_temp.dc EXPRESSION [(tmask)tmask.FieldSchema(name:f, type:float, comment:null), ] +POSTHOOK: Lineage: tmask_temp.f EXPRESSION [(tmask)tmask.FieldSchema(name:i, type:int, comment:null), ] +POSTHOOK: Lineage: tmask_temp.i EXPRESSION [(tmask)tmask.FieldSchema(name:t, type:tinyint, comment:null), ] +POSTHOOK: Lineage: tmask_temp.n EXPRESSION [(tmask)tmask.FieldSchema(name:db, type:double, comment:null), ] +POSTHOOK: Lineage: tmask_temp.s EXPRESSION [(tmask)tmask.FieldSchema(name:ts, type:timestamp, comment:null), ] +POSTHOOK: Lineage: tmask_temp.str EXPRESSION [(tmask)tmask.FieldSchema(name:str, type:string, comment:null), ] +POSTHOOK: Lineage: tmask_temp.t EXPRESSION [(tmask)tmask.FieldSchema(name:d, type:date, comment:null), ] +POSTHOOK: Lineage: tmask_temp.ts EXPRESSION [(tmask)tmask.FieldSchema(name:n, type:decimal(10,0), comment:null), ] +POSTHOOK: Lineage: tmask_temp.vr EXPRESSION [(tmask)tmask.FieldSchema(name:vr, type:varchar(10), comment:null), ] +PREHOOK: query: select count(*) from tmask_temp group by (t,s,i,bg,f,db,dc,n,d,ts,str,vr,ch,b,bin) +PREHOOK: type: QUERY +PREHOOK: Input: default@tmask_temp +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from tmask_temp group by (t,s,i,bg,f,db,dc,n,d,ts,str,vr,ch,b,bin) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tmask_temp +#### A masked pattern was here #### +2 +4 +2 +PREHOOK: query: DROP TABLE tmask +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@tmask +PREHOOK: Output: default@tmask +POSTHOOK: query: DROP TABLE tmask +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@tmask +POSTHOOK: Output: default@tmask