diff --git common/src/java/org/apache/hadoop/hive/conf/HiveConf.java common/src/java/org/apache/hadoop/hive/conf/HiveConf.java index 0880a96..7edcd98 100644 --- common/src/java/org/apache/hadoop/hive/conf/HiveConf.java +++ common/src/java/org/apache/hadoop/hive/conf/HiveConf.java @@ -2977,7 +2977,12 @@ private static void populateLlapDaemonVarsSet(Set llapDaemonVarsSetLocal "Which vectorized input format support features are enabled for vectorization.\n" + "That is, if a VectorizedInputFormat input format does support \"decimal_64\" for example\n" + "this variable must enable that to be used in vectorization"), - + HIVE_VECTORIZED_IF_EXPR_MODE("hive.vectorized.if.expr.mode", "better", new StringSet("adaptor", "good", "better"), + "Specifies the extent to which SQL IF statements will be vectorized.\n" + + "0. adaptor: only use the VectorUDFAdaptor to execute IF statements\n" + + "1. good : use vectorized expression classes get good performance\n" + + "2. better : use vectorized expressions that conditionally execute THEN/ELSE\n" + + " expressions for better performance.\n"), HIVE_TEST_VECTORIZATION_ENABLED_OVERRIDE("hive.test.vectorized.execution.enabled.override", "none", new StringSet("none", "enable", "disable"), "internal use only, used to override the hive.vectorized.execution.enabled setting and\n" + diff --git data/files/student_10_lines data/files/student_10_lines new file mode 100644 index 0000000..2f1b331 --- /dev/null +++ data/files/student_10_lines @@ -0,0 +1,10 @@ +tom thompson420.53 +luke king280.47 +priscilla falkner551.16 +luke brown601.14 +ulysses garcia352.74 +calvin brown282.70 +oscar thompson352.98 +xavier garcia331.06 +nick johnson34 +quinn ovid19 diff --git data/files/student_2_lines data/files/student_2_lines deleted file mode 100644 index 9e86836..0000000 --- data/files/student_2_lines +++ /dev/null @@ -1,2 +0,0 @@ -tom thompson420.53 -luke king280.47 diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java 
ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java index 4df6e97..4151fb7 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java @@ -133,6 +133,21 @@ public static HiveVectorAdaptorUsageMode getHiveConfValue(HiveConf hiveConf) { } private HiveVectorAdaptorUsageMode hiveVectorAdaptorUsageMode; + + public enum HiveVectorIfStmtMode { + ADAPTOR, + GOOD, + BETTER; + + public static HiveVectorIfStmtMode getHiveConfValue(HiveConf hiveConf) { + String string = HiveConf.getVar(hiveConf, + HiveConf.ConfVars.HIVE_VECTORIZED_IF_EXPR_MODE); + return valueOf(string.toUpperCase()); + } + } + + private HiveVectorIfStmtMode hiveVectorIfStmtMode; + //when set to true use the overflow checked vector expressions private boolean useCheckedVectorExpressions; @@ -141,6 +156,7 @@ public static HiveVectorAdaptorUsageMode getHiveConfValue(HiveConf hiveConf) { private void setHiveConfVars(HiveConf hiveConf) { hiveVectorAdaptorUsageMode = HiveVectorAdaptorUsageMode.getHiveConfValue(hiveConf); + hiveVectorIfStmtMode = HiveVectorIfStmtMode.getHiveConfValue(hiveConf); this.reuseScratchColumns = HiveConf.getBoolVar(hiveConf, ConfVars.HIVE_VECTORIZATION_TESTING_REUSE_SCRATCH_COLUMNS); this.ocm.setReuseColumns(reuseScratchColumns); @@ -150,6 +166,7 @@ private void setHiveConfVars(HiveConf hiveConf) { private void copyHiveConfVars(VectorizationContext vContextEnvironment) { hiveVectorAdaptorUsageMode = vContextEnvironment.hiveVectorAdaptorUsageMode; + hiveVectorIfStmtMode = vContextEnvironment.hiveVectorIfStmtMode; this.reuseScratchColumns = vContextEnvironment.reuseScratchColumns; this.ocm.setReuseColumns(reuseScratchColumns); } @@ -1978,6 +1995,8 @@ private VectorExpression getGenericUdfVectorExpression(GenericUDF udf, ve = getBetweenFilterExpression(childExpr, mode, returnType); } else if (udf instanceof GenericUDFIn) { ve = getInExpression(childExpr, mode, 
returnType); + } else if (udf instanceof GenericUDFIf) { + ve = getIfExpression((GenericUDFIf) udf, childExpr, mode, returnType); } else if (udf instanceof GenericUDFWhen) { ve = getWhenExpression(childExpr, mode, returnType); } else if (udf instanceof GenericUDFOPPositive) { @@ -2911,38 +2930,49 @@ private VectorExpression getBetweenFilterExpression(List childExpr return createVectorExpression(cl, childrenAfterNot, VectorExpressionDescriptor.Mode.PROJECTION, returnType); } + private boolean isCondExpr(ExprNodeDesc exprNodeDesc) { + if (exprNodeDesc instanceof ExprNodeConstantDesc || + exprNodeDesc instanceof ExprNodeColumnDesc) { + return false; + } + return true; // Requires conditional evaluation for good performance. + } + private boolean isNullConst(ExprNodeDesc exprNodeDesc) { //null constant could be typed so we need to check the value if (exprNodeDesc instanceof ExprNodeConstantDesc && ((ExprNodeConstantDesc) exprNodeDesc).getValue() == null) { - return true; + return true; } return false; } - private VectorExpression getWhenExpression(List childExpr, + private VectorExpression getIfExpression(GenericUDFIf genericUDFIf, List childExpr, VectorExpressionDescriptor.Mode mode, TypeInfo returnType) throws HiveException { if (mode != VectorExpressionDescriptor.Mode.PROJECTION) { return null; } - final int size = childExpr.size(); - final ExprNodeDesc whenDesc = childExpr.get(0); - final ExprNodeDesc thenDesc = childExpr.get(1); - final ExprNodeDesc elseDesc; + // Add HiveConf variable with 3 modes: + // 1) adaptor: Always use VectorUDFAdaptor for IF statements. + // + // 2) good: Vectorize but don't optimize conditional expressions + // + // 3) better: Vectorize and Optimize conditional expressions. 
+ // - if (size == 2) { - elseDesc = new ExprNodeConstantDesc(returnType, null); - } else if (size == 3) { - elseDesc = childExpr.get(2); - } else { - final GenericUDFWhen udfWhen = new GenericUDFWhen(); - elseDesc = new ExprNodeGenericFuncDesc(returnType, udfWhen, udfWhen.getUdfName(), - childExpr.subList(2, childExpr.size())); + if (hiveVectorIfStmtMode == HiveVectorIfStmtMode.ADAPTOR) { + return null; } - if (isNullConst(thenDesc) && isNullConst(elseDesc)) { + final ExprNodeDesc ifDesc = childExpr.get(0); + final ExprNodeDesc thenDesc = childExpr.get(1); + final ExprNodeDesc elseDesc = childExpr.get(2); + + final boolean isThenNullConst = isNullConst(thenDesc); + final boolean isElseNullConst = isNullConst(elseDesc); + if (isThenNullConst && isElseNullConst) { // THEN NULL ELSE NULL: An unusual "case", but possible. final int outputColumnNum = ocm.allocateOutputColumn(returnType); @@ -2956,17 +2986,32 @@ private VectorExpression getWhenExpression(List childExpr, return resultExpr; } - if (isNullConst(thenDesc)) { - final VectorExpression whenExpr = getVectorExpression(whenDesc, mode); + + final boolean isThenCondExpr = isCondExpr(thenDesc); + final boolean isElseCondExpr = isCondExpr(elseDesc); + + final boolean isOnlyGood = (hiveVectorIfStmtMode == HiveVectorIfStmtMode.GOOD); + + if (isThenNullConst) { + final VectorExpression whenExpr = getVectorExpression(ifDesc, mode); final VectorExpression elseExpr = getVectorExpression(elseDesc, mode); final int outputColumnNum = ocm.allocateOutputColumn(returnType); - final VectorExpression resultExpr = - new IfExprNullColumn( - whenExpr.getOutputColumnNum(), - elseExpr.getOutputColumnNum(), - outputColumnNum); + final VectorExpression resultExpr; + if (!isElseCondExpr || isOnlyGood) { + resultExpr = + new IfExprNullColumn( + whenExpr.getOutputColumnNum(), + elseExpr.getOutputColumnNum(), + outputColumnNum); + } else { + resultExpr = + new IfExprNullCondExpr( + whenExpr.getOutputColumnNum(), + 
elseExpr.getOutputColumnNum(), + outputColumnNum); + } resultExpr.setChildExpressions(new VectorExpression[] {whenExpr, elseExpr}); @@ -2984,17 +3029,27 @@ private VectorExpression getWhenExpression(List childExpr, return resultExpr; } - if (isNullConst(elseDesc)) { - final VectorExpression whenExpr = getVectorExpression(whenDesc, mode); + + if (isElseNullConst) { + final VectorExpression whenExpr = getVectorExpression(ifDesc, mode); final VectorExpression thenExpr = getVectorExpression(thenDesc, mode); final int outputColumnNum = ocm.allocateOutputColumn(returnType); - final VectorExpression resultExpr = - new IfExprColumnNull( - whenExpr.getOutputColumnNum(), - thenExpr.getOutputColumnNum(), - outputColumnNum); + final VectorExpression resultExpr; + if (!isThenCondExpr || isOnlyGood) { + resultExpr = + new IfExprColumnNull( + whenExpr.getOutputColumnNum(), + thenExpr.getOutputColumnNum(), + outputColumnNum); + } else { + resultExpr = + new IfExprCondExprNull( + whenExpr.getOutputColumnNum(), + thenExpr.getOutputColumnNum(), + outputColumnNum); + } resultExpr.setChildExpressions(new VectorExpression[] {whenExpr, thenExpr}); @@ -3012,11 +3067,87 @@ private VectorExpression getWhenExpression(List childExpr, return resultExpr; } + + if ((isThenCondExpr || isElseCondExpr) && !isOnlyGood) { + final VectorExpression whenExpr = getVectorExpression(ifDesc, mode); + final VectorExpression thenExpr = getVectorExpression(thenDesc, mode); + final VectorExpression elseExpr = getVectorExpression(elseDesc, mode); + + final int outputColumnNum = ocm.allocateOutputColumn(returnType); + + final VectorExpression resultExpr; + if (isThenCondExpr && isElseCondExpr) { + resultExpr = + new IfExprCondExprCondExpr( + whenExpr.getOutputColumnNum(), + thenExpr.getOutputColumnNum(), + elseExpr.getOutputColumnNum(), + outputColumnNum); + } else if (isThenCondExpr) { + resultExpr = + new IfExprCondExprColumn( + whenExpr.getOutputColumnNum(), + thenExpr.getOutputColumnNum(), + 
elseExpr.getOutputColumnNum(), + outputColumnNum); + } else { + resultExpr = + new IfExprColumnCondExpr( + whenExpr.getOutputColumnNum(), + thenExpr.getOutputColumnNum(), + elseExpr.getOutputColumnNum(), + outputColumnNum); + } + + resultExpr.setChildExpressions(new VectorExpression[] {whenExpr, thenExpr, elseExpr}); + + resultExpr.setInputTypeInfos( + whenExpr.getOutputTypeInfo(), + thenExpr.getOutputTypeInfo(), + elseExpr.getOutputTypeInfo()); + resultExpr.setInputDataTypePhysicalVariations( + whenExpr.getOutputDataTypePhysicalVariation(), + thenExpr.getOutputDataTypePhysicalVariation(), + elseExpr.getOutputDataTypePhysicalVariation()); + + resultExpr.setOutputTypeInfo(returnType); + resultExpr.setOutputDataTypePhysicalVariation(DataTypePhysicalVariation.NONE); + + return resultExpr; + } + + Class udfClass = genericUDFIf.getClass(); + return getVectorExpressionForUdf( + genericUDFIf, udfClass, childExpr, mode, returnType); + } + + private VectorExpression getWhenExpression(List childExpr, + VectorExpressionDescriptor.Mode mode, TypeInfo returnType) throws HiveException { + + if (mode != VectorExpressionDescriptor.Mode.PROJECTION) { + return null; + } + final int size = childExpr.size(); + + final ExprNodeDesc whenDesc = childExpr.get(0); + final ExprNodeDesc thenDesc = childExpr.get(1); + final ExprNodeDesc elseDesc; + + if (size == 2) { + elseDesc = new ExprNodeConstantDesc(returnType, null); + } else if (size == 3) { + elseDesc = childExpr.get(2); + } else { + final GenericUDFWhen udfWhen = new GenericUDFWhen(); + elseDesc = new ExprNodeGenericFuncDesc(returnType, udfWhen, udfWhen.getUdfName(), + childExpr.subList(2, childExpr.size())); + } + + // Transform CASE WHEN with just a THEN/ELSE into an IF statement. 
final GenericUDFIf genericUDFIf = new GenericUDFIf(); - final List ifChildExpr = Arrays.asList(whenDesc, thenDesc, elseDesc); - final ExprNodeGenericFuncDesc exprNodeDesc = - new ExprNodeGenericFuncDesc(returnType, genericUDFIf, "if", ifChildExpr); - return getVectorExpression(exprNodeDesc, mode); + final List ifChildExpr = + Arrays.asList(whenDesc, thenDesc, elseDesc); + return getIfExpression(genericUDFIf, ifChildExpr, mode, returnType); } /* diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/IfExprColumnCondExpr.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/IfExprColumnCondExpr.java new file mode 100644 index 0000000..c18f102 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/IfExprColumnCondExpr.java @@ -0,0 +1,125 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.exec.vector.expressions; + +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; + +/** + * Do regular execution of the THEN vector expression (a column or scalar) and conditional execution + * of the ELSE vector expression of a SQL IF statement. 
+ */ +public class IfExprColumnCondExpr extends IfExprCondExprBase { + private static final long serialVersionUID = 1L; + + protected final int arg2Column; + protected final int arg3Column; + + public IfExprColumnCondExpr(int arg1Column, int arg2Column, int arg3Column, + int outputColumnNum) { + super(arg1Column, outputColumnNum); + this.arg2Column = arg2Column; + this.arg3Column = arg3Column; + } + + public IfExprColumnCondExpr() { + super(); + + // Dummy final assignments. + arg2Column = -1; + arg3Column = -1; + } + + @Override + public void evaluate(VectorizedRowBatch batch) { + + int n = batch.size; + if (n <= 0) { + // Nothing to do + return; + } + + /* + * Do common analysis of the IF statement boolean expression. + * + * The following protected members can be examined afterwards: + * + * boolean isIfStatementResultRepeated + * boolean isIfStatementResultThen + * + * int thenSelectedCount + * int[] thenSelected + * int elseSelectedCount + * int[] elseSelected + */ + super.evaluate(batch); + + ColumnVector outputColVector = batch.cols[outputColumnNum]; + boolean[] outputIsNull = outputColVector.isNull; + + // We do not need to do a column reset since we are carefully changing the output. + outputColVector.isRepeating = false; + + // CONSIDER: Should we do this for all vector expressions that can + // work on BytesColumnVector output columns??? + outputColVector.init(); + + ColumnVector thenColVector = batch.cols[arg2Column]; + ColumnVector elseColVector = batch.cols[arg3Column]; + + final int thenCount = thenSelectedCount; + final int elseCount = elseSelectedCount; + + if (isIfStatementResultRepeated) { + if (isIfStatementResultThen) { + // Evaluate THEN expression (only) and copy all its results. + childExpressions[1].evaluate(batch); + thenColVector.copySelected(batch.selectedInUse, batch.selected, n, outputColVector); + } else { + // Evaluate ELSE expression (only) and copy all its results. 
+ childExpressions[2].evaluate(batch); + elseColVector.copySelected(batch.selectedInUse, batch.selected, n, outputColVector); + } + return; + } + + // NOTE: We cannot use copySelected below since it is a whole column operation. + + // The THEN expression is either IdentityExpression (a column) or a ConstantVectorExpression + // (a scalar) and trivial to evaluate. + childExpressions[1].evaluate(batch); + for (int i = 0; i < thenCount; i++) { + final int batchIndex = thenSelected[i]; + outputIsNull[batchIndex] = false; + outputColVector.setElement(batchIndex, batchIndex, thenColVector); + } + + conditionalEvaluate(batch, childExpressions[2], elseSelected, elseCount); + for (int i = 0; i < elseCount; i++) { + final int batchIndex = elseSelected[i]; + outputIsNull[batchIndex] = false; + outputColVector.setElement(batchIndex, batchIndex, elseColVector); + } + } + + @Override + public String vectorExpressionParameters() { + return getColumnParamString(0, arg1Column) + ", " + getColumnParamString(1, arg2Column) + + ", " + getColumnParamString(2, arg3Column); + } +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/IfExprCondExprBase.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/IfExprCondExprBase.java new file mode 100644 index 0000000..80b0ab1 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/IfExprCondExprBase.java @@ -0,0 +1,171 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.exec.vector.expressions; + +import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorExpressionDescriptor; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; + +/** + * Base class that supports conditional execution of the THEN/ELSE vector expressions of + * a SQL IF statement + */ +public abstract class IfExprCondExprBase extends VectorExpression { + private static final long serialVersionUID = 1L; + + protected final int arg1Column; + + // Whether the IF statement boolean expression was repeating. + protected transient boolean isIfStatementResultRepeated; + protected transient boolean isIfStatementResultThen; + + // The batchIndex for the rows that are for the THEN/ELSE rows respectively. + // Temporary work arrays. + protected transient int thenSelectedCount; + protected transient int[] thenSelected; + protected transient int elseSelectedCount; + protected transient int[] elseSelected; + + public IfExprCondExprBase(int arg1Column, int outputColumnNum) { + super(outputColumnNum); + this.arg1Column = arg1Column; + } + + public IfExprCondExprBase() { + super(); + + // Dummy final assignments. 
+ arg1Column = -1; + } + + public void conditionalEvaluate(VectorizedRowBatch batch, VectorExpression condVecExpr, + int[] condSelected, int condSize) { + + int saveSize = batch.size; + boolean saveSelectedInUse = batch.selectedInUse; + int[] saveSelected = batch.selected; + + batch.size = condSize; + batch.selectedInUse = true; + batch.selected = condSelected; + + condVecExpr.evaluate(batch); + + batch.size = saveSize; + batch.selectedInUse = saveSelectedInUse; + batch.selected = saveSelected; + } + + @Override + public void evaluate(VectorizedRowBatch batch) { + + // NOTE: We do conditional vector expression so we do not call super.evaluateChildren(batch). + + thenSelectedCount = 0; + elseSelectedCount = 0; + isIfStatementResultRepeated = false; + isIfStatementResultThen = false; // Give it a value. + + int n = batch.size; + if (n <= 0) { + // Nothing to do + return; + } + + // Child #1 is the IF boolean expression. + childExpressions[0].evaluate(batch); + LongColumnVector ifExprColVector = (LongColumnVector) batch.cols[arg1Column]; + if (ifExprColVector.isRepeating) { + isIfStatementResultRepeated = true; + isIfStatementResultThen = + ((ifExprColVector.noNulls || !ifExprColVector.isNull[0]) && + ifExprColVector.vector[0] == 1); + return; + } + + if (thenSelected == null || n > thenSelected.length) { + + // (Re)allocate larger to be a multiple of 1024 (DEFAULT_SIZE). 
+ final int roundUpSize = + ((n + VectorizedRowBatch.DEFAULT_SIZE - 1) / VectorizedRowBatch.DEFAULT_SIZE) + * VectorizedRowBatch.DEFAULT_SIZE; + thenSelected = new int[roundUpSize]; + elseSelected = new int[roundUpSize]; + } + + int[] sel = batch.selected; + long[] vector = ifExprColVector.vector; + + if (ifExprColVector.noNulls) { + if (batch.selectedInUse) { + for (int j = 0; j < n; j++) { + final int i = sel[j]; + if (vector[i] == 1) { + thenSelected[thenSelectedCount++] = i; + } else { + elseSelected[elseSelectedCount++] = i; + } + } + } else { + for (int i = 0; i < n; i++) { + if (vector[i] == 1) { + thenSelected[thenSelectedCount++] = i; + } else { + elseSelected[elseSelectedCount++] = i; + } + } + } + } else { + boolean[] isNull = ifExprColVector.isNull; + if (batch.selectedInUse) { + for (int j = 0; j < n; j++) { + final int i = sel[j]; + if (!isNull[i] && vector[i] == 1) { + thenSelected[thenSelectedCount++] = i; + } else { + elseSelected[elseSelectedCount++] = i; + } + } + } else { + for (int i = 0; i < n; i++) { + if (!isNull[i] && vector[i] == 1) { + thenSelected[thenSelectedCount++] = i; + } else { + elseSelected[elseSelectedCount++] = i; + } + } + } + } + + if (thenSelectedCount == 0) { + isIfStatementResultRepeated = true; + isIfStatementResultThen = false; + } else if (elseSelectedCount == 0) { + isIfStatementResultRepeated = true; + isIfStatementResultThen = true; + } + } + + @Override + public VectorExpressionDescriptor.Descriptor getDescriptor() { + + // Descriptor is not defined because it takes variable number of arguments with different + // data types. 
+ throw new UnsupportedOperationException("Undefined descriptor"); + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/IfExprCondExprColumn.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/IfExprCondExprColumn.java new file mode 100644 index 0000000..a7ac963 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/IfExprCondExprColumn.java @@ -0,0 +1,125 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.exec.vector.expressions; + +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; + +/** + * Do conditional execution of the THEN vector expression and regular execution of the ELSE + * vector expression (a column or scalar) of a SQL IF statement. 
+ */ +public class IfExprCondExprColumn extends IfExprCondExprBase { + private static final long serialVersionUID = 1L; + + protected final int arg2Column; + protected final int arg3Column; + + public IfExprCondExprColumn(int arg1Column, int arg2Column, int arg3Column, + int outputColumnNum) { + super(arg1Column, outputColumnNum); + this.arg2Column = arg2Column; + this.arg3Column = arg3Column; + } + + public IfExprCondExprColumn() { + super(); + + // Dummy final assignments. + arg2Column = -1; + arg3Column = -1; + } + + @Override + public void evaluate(VectorizedRowBatch batch) { + + int n = batch.size; + if (n <= 0) { + // Nothing to do + return; + } + + /* + * Do common analysis of the IF statement boolean expression. + * + * The following protected members can be examined afterwards: + * + * boolean isIfStatementResultRepeated + * boolean isIfStatementResultThen + * + * int thenSelectedCount + * int[] thenSelected + * int elseSelectedCount + * int[] elseSelected + */ + super.evaluate(batch); + + ColumnVector outputColVector = batch.cols[outputColumnNum]; + boolean[] outputIsNull = outputColVector.isNull; + + // We do not need to do a column reset since we are carefully changing the output. + outputColVector.isRepeating = false; + + // CONSIDER: Should we do this for all vector expressions that can + // work on BytesColumnVector output columns??? + outputColVector.init(); + + ColumnVector thenColVector = batch.cols[arg2Column]; + ColumnVector elseColVector = batch.cols[arg3Column]; + + final int thenCount = thenSelectedCount; + final int elseCount = elseSelectedCount; + + if (isIfStatementResultRepeated) { + if (isIfStatementResultThen) { + // Evaluate THEN expression (only) and copy all its results. + childExpressions[1].evaluate(batch); + thenColVector.copySelected(batch.selectedInUse, batch.selected, n, outputColVector); + } else { + // Evaluate ELSE expression (only) and copy all its results. 
+ childExpressions[2].evaluate(batch); + elseColVector.copySelected(batch.selectedInUse, batch.selected, n, outputColVector); + } + return; + } + + // NOTE: We cannot use copySelected below since it is a whole column operation. + + conditionalEvaluate(batch, childExpressions[1], thenSelected, thenCount); + for (int i = 0; i < thenCount; i++) { + final int batchIndex = thenSelected[i]; + outputIsNull[batchIndex] = false; + outputColVector.setElement(batchIndex, batchIndex, thenColVector); + } + + // The ELSE expression is either IdentityExpression (a column) or a ConstantVectorExpression + // (a scalar) and trivial to evaluate. + childExpressions[2].evaluate(batch); + for (int i = 0; i < elseCount; i++) { + final int batchIndex = elseSelected[i]; + outputIsNull[batchIndex] = false; + outputColVector.setElement(batchIndex, batchIndex, elseColVector); + } + } + + @Override + public String vectorExpressionParameters() { + return getColumnParamString(0, arg1Column) + ", " + getColumnParamString(1, arg2Column) + + ", " + getColumnParamString(2, arg3Column); + } +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/IfExprCondExprCondExpr.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/IfExprCondExprCondExpr.java new file mode 100644 index 0000000..b744795 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/IfExprCondExprCondExpr.java @@ -0,0 +1,122 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.exec.vector.expressions; + +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; + +/** + * Do conditional execution of the THEN/ELSE vector expressions of a SQL IF statement. + */ +public class IfExprCondExprCondExpr extends IfExprCondExprBase { + private static final long serialVersionUID = 1L; + + protected final int arg2Column; + protected final int arg3Column; + + public IfExprCondExprCondExpr(int arg1Column, int arg2Column, int arg3Column, + int outputColumnNum) { + super(arg1Column, outputColumnNum); + this.arg2Column = arg2Column; + this.arg3Column = arg3Column; + } + + public IfExprCondExprCondExpr() { + super(); + + // Dummy final assignments. + arg2Column = -1; + arg3Column = -1; + } + + @Override + public void evaluate(VectorizedRowBatch batch) { + + int n = batch.size; + if (n <= 0) { + // Nothing to do + return; + } + + /* + * Do common analysis of the IF statement boolean expression. + * + * The following protected members can be examined afterwards: + * + * boolean isIfStatementResultRepeated + * boolean isIfStatementResultThen + * + * int thenSelectedCount + * int[] thenSelected + * int elseSelectedCount + * int[] elseSelected + */ + super.evaluate(batch); + + ColumnVector outputColVector = batch.cols[outputColumnNum]; + boolean[] outputIsNull = outputColVector.isNull; + + // We do not need to do a column reset since we are carefully changing the output. 
+ outputColVector.isRepeating = false; + + // CONSIDER: Should we do this for all vector expressions that can + // work on BytesColumnVector output columns??? + outputColVector.init(); + + ColumnVector thenColVector = batch.cols[arg2Column]; + ColumnVector elseColVector = batch.cols[arg3Column]; + + final int thenCount = thenSelectedCount; + final int elseCount = elseSelectedCount; + + if (isIfStatementResultRepeated) { + if (isIfStatementResultThen) { + // Evaluate THEN expression (only) and copy all its results. + childExpressions[1].evaluate(batch); + thenColVector.copySelected(batch.selectedInUse, batch.selected, n, outputColVector); + } else { + // Evaluate ELSE expression (only) and copy all its results. + childExpressions[2].evaluate(batch); + elseColVector.copySelected(batch.selectedInUse, batch.selected, n, outputColVector); + } + return; + } + + // NOTE: We cannot use copySelected below since it is a whole column operation. + + conditionalEvaluate(batch, childExpressions[1], thenSelected, thenCount); + for (int i = 0; i < thenCount; i++) { + final int batchIndex = thenSelected[i]; + outputIsNull[batchIndex] = false; + outputColVector.setElement(batchIndex, batchIndex, thenColVector); + } + + conditionalEvaluate(batch, childExpressions[2], elseSelected, elseCount); + for (int i = 0; i < elseCount; i++) { + final int batchIndex = elseSelected[i]; + outputIsNull[batchIndex] = false; + outputColVector.setElement(batchIndex, batchIndex, elseColVector); + } + } + + @Override + public String vectorExpressionParameters() { + return getColumnParamString(0, arg1Column) + ", " + getColumnParamString(1, arg2Column) + + ", " + getColumnParamString(2, arg3Column); + } +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/IfExprCondExprNull.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/IfExprCondExprNull.java new file mode 100644 index 0000000..591e85b --- /dev/null +++ 
ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/IfExprCondExprNull.java
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.exec.vector.expressions;
+
+import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+
+/**
+ * SQL IF statement with a NULL ELSE: conditionally executes the THEN vector expression and
+ * marks the ELSE rows NULL.
+ * <p>
+ * The THEN expression is the second input parameter (childExpressions[1]) and its column is
+ * held in arg2Column.
+ */
+public class IfExprCondExprNull extends IfExprCondExprBase {
+  private static final long serialVersionUID = 1L;
+
+  // Batch column index the THEN results are read from.
+  protected final int arg2Column;
+
+  public IfExprCondExprNull(int arg1Column, int arg2Column, int outputColumnNum) {
+    super(arg1Column, outputColumnNum);
+    this.arg2Column = arg2Column;
+  }
+
+  public IfExprCondExprNull() {
+    super();
+
+    // Placeholder value; a final field must be assigned in every constructor.
+    arg2Column = -1;
+  }
+
+  @Override
+  public void evaluate(VectorizedRowBatch batch) {
+    final int size = batch.size;
+    if (size <= 0) {
+      // Empty batch: nothing to do.
+      return;
+    }
+
+    // Common analysis of the IF statement boolean expression. Afterwards these protected
+    // members can be examined:
+    //   isIfStatementResultRepeated, isIfStatementResultThen,
+    //   thenSelectedCount, thenSelected,
+    //   elseSelectedCount, elseSelected
+    super.evaluate(batch);
+
+    final ColumnVector result = batch.cols[outputColumnNum];
+    final boolean[] resultIsNull = result.isNull;
+
+    // No column reset is needed since the output is changed carefully below.
+    result.isRepeating = false;
+
+    // CONSIDER: Should we do this for all vector expressions that can
+    //           work on BytesColumnVector output columns???
+    result.init();
+
+    final ColumnVector thenValues = batch.cols[arg2Column];
+
+    final int numThen = thenSelectedCount;
+    final int numElse = elseSelectedCount;
+
+    if (isIfStatementResultRepeated && isIfStatementResultThen) {
+      // Evaluate THEN expression (only) and copy all its results.
+      childExpressions[1].evaluate(batch);
+      thenValues.copySelected(batch.selectedInUse, batch.selected, size, result);
+      return;
+    }
+    if (isIfStatementResultRepeated) {
+      // Every row takes the NULL ELSE: emit one repeated NULL entry.
+      resultIsNull[0] = true;
+      result.noNulls = false;
+      result.isRepeating = true;
+      return;
+    }
+
+    // NOTE: copySelected cannot be used below since it is a whole column operation.
+
+    conditionalEvaluate(batch, childExpressions[1], thenSelected, numThen);
+    for (int t = 0; t < numThen; t++) {
+      final int row = thenSelected[t];
+      resultIsNull[row] = false;
+      result.setElement(row, row, thenValues);
+    }
+
+    // Mark every ELSE row NULL.
+    result.noNulls = false;
+    for (int e = 0; e < numElse; e++) {
+      result.isNull[elseSelected[e]] = true;
+    }
+  }
+
+  @Override
+  public String vectorExpressionParameters() {
+    // The NULL ELSE has no column, so it is shown as the literal "null".
+    return getColumnParamString(0, arg1Column)
+        + ", " + getColumnParamString(1, arg2Column)
+        + ", null";
+  }
+}
\ No newline at end of file
diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/IfExprNullCondExpr.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/IfExprNullCondExpr.java
new file mode 100644
index 0000000..0b99175
--- /dev/null
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/IfExprNullCondExpr.java
@@ -0,0 +1,117 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.exec.vector.expressions;
+
+import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+
+/**
+ * SQL IF statement with a NULL THEN: marks the THEN rows NULL and conditionally executes the
+ * ELSE vector expression.
+ * <p>
+ * The ELSE expression is the second input parameter (childExpressions[1]) even though its
+ * column is held in arg3Column.
+ */
+public class IfExprNullCondExpr extends IfExprCondExprBase {
+  private static final long serialVersionUID = 1L;
+
+  // Batch column index the ELSE results are read from.
+  protected final int arg3Column;
+
+  public IfExprNullCondExpr(int arg1Column, int arg3Column, int outputColumnNum) {
+    super(arg1Column, outputColumnNum);
+    this.arg3Column = arg3Column;
+  }
+
+  public IfExprNullCondExpr() {
+    super();
+
+    // Placeholder value; a final field must be assigned in every constructor.
+    arg3Column = -1;
+  }
+
+  @Override
+  public void evaluate(VectorizedRowBatch batch) {
+    final int size = batch.size;
+    if (size <= 0) {
+      // Empty batch: nothing to do.
+      return;
+    }
+
+    // Common analysis of the IF statement boolean expression. Afterwards these protected
+    // members can be examined:
+    //   isIfStatementResultRepeated, isIfStatementResultThen,
+    //   thenSelectedCount, thenSelected,
+    //   elseSelectedCount, elseSelected
+    super.evaluate(batch);
+
+    final ColumnVector result = batch.cols[outputColumnNum];
+    final boolean[] resultIsNull = result.isNull;
+
+    // No column reset is needed since the output is changed carefully below.
+    result.isRepeating = false;
+
+    // CONSIDER: Should we do this for all vector expressions that can
+    //           work on BytesColumnVector output columns???
+    result.init();
+
+    final ColumnVector elseValues = batch.cols[arg3Column];
+
+    final int numThen = thenSelectedCount;
+    final int numElse = elseSelectedCount;
+
+    if (isIfStatementResultRepeated && isIfStatementResultThen) {
+      // Every row takes the NULL THEN: emit one repeated NULL entry.
+      resultIsNull[0] = true;
+      result.noNulls = false;
+      result.isRepeating = true;
+      return;
+    }
+    if (isIfStatementResultRepeated) {
+      // Evaluate ELSE expression (only) and copy all its results.
+      // Second input parameter but 3rd column.
+      childExpressions[1].evaluate(batch);
+      elseValues.copySelected(batch.selectedInUse, batch.selected, size, result);
+      return;
+    }
+
+    // NOTE: copySelected cannot be used below since it is a whole column operation.
+
+    // Mark every THEN row NULL.
+    result.noNulls = false;
+    for (int t = 0; t < numThen; t++) {
+      result.isNull[thenSelected[t]] = true;
+    }
+
+    // Second input parameter but 3rd column.
+    conditionalEvaluate(batch, childExpressions[1], elseSelected, numElse);
+    for (int e = 0; e < numElse; e++) {
+      final int row = elseSelected[e];
+      resultIsNull[row] = false;
+      result.setElement(row, row, elseValues);
+    }
+  }
+
+  @Override
+  public String vectorExpressionParameters() {
+    // Second input parameter but 3rd column.
+    return getColumnParamString(0, arg1Column)
+        + ", null, "
+        + getColumnParamString(2, arg3Column);
+  }
+}
\ No newline at end of file
diff --git ql/src/test/queries/clientpositive/vector_udf_adaptor_1.q ql/src/test/queries/clientpositive/vector_udf_adaptor_1.q
index 2eb0a0a..565edee 100644
--- ql/src/test/queries/clientpositive/vector_udf_adaptor_1.q
+++ ql/src/test/queries/clientpositive/vector_udf_adaptor_1.q
@@ -1,27 +1,209 @@ +set hive.cli.print.header=true; SET hive.vectorized.execution.enabled=true; set hive.fetch.task.conversion=none; set hive.stats.column.autogather=false; -create table student_2_lines( +-- SORT_QUERY_RESULTS + +create table student_10_lines_txt( name string, age int, gpa double) row format delimited fields terminated by '\001' stored as textfile; -LOAD DATA LOCAL INPATH '../../data/files/student_2_lines' OVERWRITE INTO TABLE student_2_lines; -analyze table student_2_lines compute statistics; +LOAD DATA LOCAL INPATH '../../data/files/student_10_lines' OVERWRITE INTO TABLE student_10_lines_txt; +CREATE TABLE student_10_lines STORED AS ORC AS SELECT * FROM student_10_lines_txt; +INSERT INTO TABLE student_10_lines VALUES (NULL, NULL,
NULL); +INSERT INTO TABLE student_10_lines VALUES ("George", 22, 3.8); +analyze table student_10_lines compute statistics; + +------------------------------------------------------------------------------------------ + +SET hive.vectorized.if.expr.mode=adaptor; + +create table insert_a_adaptor (name string, age int, gpa double, a int, b timestamp, c string, d binary, e int, f double); + +explain vectorization detail +insert overwrite table insert_a_adaptor + select + name, + age, + gpa, + IF(age<40, age, NULL), + IF(age>40, cast('2011-01-01 01:01:01' as timestamp), NULL), + IF(LENGTH(name)>8, name, NULL), + IF(LENGTH(name)<8, cast(name as binary), NULL), + IF(age>40, LENGTH(name), NULL), + IF(LENGTH(name)> 10, 2 * gpa, NULL) + from student_10_lines; +insert overwrite table insert_a_adaptor + select + name, + age, + gpa, + IF(age<40, age, NULL), + IF(age>40, cast('2011-01-01 01:01:01' as timestamp), NULL), + IF(LENGTH(name)>8, name, NULL), + IF(LENGTH(name)<8, cast(name as binary), NULL), + IF(age>40, LENGTH(name), NULL), + IF(LENGTH(name)> 10, 2 * gpa, NULL) + from student_10_lines; +select * from insert_a_adaptor; + +SET hive.vectorized.if.expr.mode=good; + +create table insert_a_good (name string, age int, gpa double, a int, b timestamp, c string, d binary, e int, f double); + +explain vectorization detail +insert overwrite table insert_a_good + select + name, + age, + gpa, + IF(age<40, age, NULL), + IF(age>40, cast('2011-01-01 01:01:01' as timestamp), NULL), + IF(LENGTH(name)>8, name, NULL), + IF(LENGTH(name)<8, cast(name as binary), NULL), + IF(age>40, LENGTH(name), NULL), + IF(LENGTH(name)> 10, 2 * gpa, NULL) + from student_10_lines; +insert overwrite table insert_a_good + select + name, + age, + gpa, + IF(age<40, age, NULL), + IF(age>40, cast('2011-01-01 01:01:01' as timestamp), NULL), + IF(LENGTH(name)>8, name, NULL), + IF(LENGTH(name)<8, cast(name as binary), NULL), + IF(age>40, LENGTH(name), NULL), + IF(LENGTH(name)> 10, 2 * gpa, NULL) + from 
student_10_lines; +select * from insert_a_good; + +SET hive.vectorized.if.expr.mode=better; + +create table insert_a_better (name string, age int, gpa double, a int, b timestamp, c string, d binary, e int, f double); + +explain vectorization detail +insert overwrite table insert_a_better + select + name, + age, + gpa, + IF(age<40, age, NULL), + IF(age>40, cast('2011-01-01 01:01:01' as timestamp), NULL), + IF(LENGTH(name)>8, name, NULL), + IF(LENGTH(name)<8, cast(name as binary), NULL), + IF(age>40, LENGTH(name), NULL), + IF(LENGTH(name)> 10, 2 * gpa, NULL) + from student_10_lines; +insert overwrite table insert_a_better + select + name, + age, + gpa, + IF(age<40, age, NULL), + IF(age>40, cast('2011-01-01 01:01:01' as timestamp), NULL), + IF(LENGTH(name)>8, name, NULL), + IF(LENGTH(name)<8, cast(name as binary), NULL), + IF(age>40, LENGTH(name), NULL), + IF(LENGTH(name)> 10, 2 * gpa, NULL) + from student_10_lines; +select * from insert_a_better; + +------------------------------------------------------------------------------------------ + +SET hive.vectorized.if.expr.mode=adaptor; + +create table insert_b_adaptor (name string, age int, gpa double, a int, b timestamp, c string, d binary, e int, f double); + +explain vectorization detail +insert overwrite table insert_b_adaptor + select + name, + age, + gpa, + IF(age<40, NULL, age), + IF(age>40, NULL, cast('2011-01-01 01:01:01' as timestamp)), + IF(LENGTH(name)>8, NULL, name), + IF(LENGTH(name)<8, NULL, cast(name as binary)), + IF(age>40, NULL, LENGTH(name)), + IF(LENGTH(name)> 10, NULL, 2 * gpa) + from student_10_lines; +insert overwrite table insert_b_adaptor + select + name, + age, + gpa, + IF(age<40, NULL, age), + IF(age>40, NULL, cast('2011-01-01 01:01:01' as timestamp)), + IF(LENGTH(name)>8, NULL, name), + IF(LENGTH(name)<8, NULL, cast(name as binary)), + IF(age>40, NULL, LENGTH(name)), + IF(LENGTH(name)> 10, NULL, 2 * gpa) + from student_10_lines; +select * from insert_b_adaptor; + +SET 
hive.vectorized.if.expr.mode=good; + +create table insert_b_good (name string, age int, gpa double, a int, b timestamp, c string, d binary, e int, f double); + +explain vectorization detail +insert overwrite table insert_b_good + select + name, + age, + gpa, + IF(age<40, NULL, age), + IF(age>40, NULL, cast('2011-01-01 01:01:01' as timestamp)), + IF(LENGTH(name)>8, NULL, name), + IF(LENGTH(name)<8, NULL, cast(name as binary)), + IF(age>40, NULL, LENGTH(name)), + IF(LENGTH(name)> 10, NULL, 2 * gpa) + from student_10_lines; +insert overwrite table insert_b_good + select + name, + age, + gpa, + IF(age<40, NULL, age), + IF(age>40, NULL, cast('2011-01-01 01:01:01' as timestamp)), + IF(LENGTH(name)>8, NULL, name), + IF(LENGTH(name)<8, NULL, cast(name as binary)), + IF(age>40, NULL, LENGTH(name)), + IF(LENGTH(name)> 10, NULL, 2 * gpa) + from student_10_lines; +select * from insert_b_good; + +SET hive.vectorized.if.expr.mode=better; -create table insert_10_1 (a float, b int, c timestamp, d binary); +create table insert_b_better (name string, age int, gpa double, a int, b timestamp, c string, d binary, e int, f double); explain vectorization detail -insert overwrite table insert_10_1 - select cast(gpa as float), - age, - IF(age>40,cast('2011-01-01 01:01:01' as timestamp),NULL), - IF(LENGTH(name)>10,cast(name as binary),NULL) from student_2_lines; -insert overwrite table insert_10_1 - select cast(gpa as float), - age, - IF(age>40,cast('2011-01-01 01:01:01' as timestamp),NULL), - IF(LENGTH(name)>10,cast(name as binary),NULL) from student_2_lines; \ No newline at end of file +insert overwrite table insert_b_better + select + name, + age, + gpa, + IF(age<40, NULL, age), + IF(age>40, NULL, cast('2011-01-01 01:01:01' as timestamp)), + IF(LENGTH(name)>8, NULL, name), + IF(LENGTH(name)<8, NULL, cast(name as binary)), + IF(age>40, NULL, LENGTH(name)), + IF(LENGTH(name)> 10, NULL, 2 * gpa) + from student_10_lines; +insert overwrite table insert_b_better + select + name, + age, + gpa, 
+ IF(age<40, NULL, age), + IF(age>40, NULL, cast('2011-01-01 01:01:01' as timestamp)), + IF(LENGTH(name)>8, NULL, name), + IF(LENGTH(name)<8, NULL, cast(name as binary)), + IF(age>40, NULL, LENGTH(name)), + IF(LENGTH(name)> 10, NULL, 2 * gpa) + from student_10_lines; +select * from insert_b_better; \ No newline at end of file diff --git ql/src/test/results/clientpositive/llap/vector_udf_adaptor_1.q.out ql/src/test/results/clientpositive/llap/vector_udf_adaptor_1.q.out index a752dfa..2d23730 100644 --- ql/src/test/results/clientpositive/llap/vector_udf_adaptor_1.q.out +++ ql/src/test/results/clientpositive/llap/vector_udf_adaptor_1.q.out @@ -1,4 +1,4 @@ -PREHOOK: query: create table student_2_lines( +PREHOOK: query: create table student_10_lines_txt( name string, age int, gpa double) @@ -7,8 +7,8 @@ fields terminated by '\001' stored as textfile PREHOOK: type: CREATETABLE PREHOOK: Output: database:default -PREHOOK: Output: default@student_2_lines -POSTHOOK: query: create table student_2_lines( +PREHOOK: Output: default@student_10_lines_txt +POSTHOOK: query: create table student_10_lines_txt( name string, age int, gpa double) @@ -17,45 +17,431 @@ fields terminated by '\001' stored as textfile POSTHOOK: type: CREATETABLE POSTHOOK: Output: database:default -POSTHOOK: Output: default@student_2_lines -PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/student_2_lines' OVERWRITE INTO TABLE student_2_lines +POSTHOOK: Output: default@student_10_lines_txt +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/student_10_lines' OVERWRITE INTO TABLE student_10_lines_txt PREHOOK: type: LOAD #### A masked pattern was here #### -PREHOOK: Output: default@student_2_lines -POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/student_2_lines' OVERWRITE INTO TABLE student_2_lines +PREHOOK: Output: default@student_10_lines_txt +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/student_10_lines' OVERWRITE INTO TABLE student_10_lines_txt POSTHOOK: type: LOAD #### A 
masked pattern was here #### -POSTHOOK: Output: default@student_2_lines -PREHOOK: query: analyze table student_2_lines compute statistics +POSTHOOK: Output: default@student_10_lines_txt +PREHOOK: query: CREATE TABLE student_10_lines STORED AS ORC AS SELECT * FROM student_10_lines_txt +PREHOOK: type: CREATETABLE_AS_SELECT +PREHOOK: Input: default@student_10_lines_txt +PREHOOK: Output: database:default +PREHOOK: Output: default@student_10_lines +POSTHOOK: query: CREATE TABLE student_10_lines STORED AS ORC AS SELECT * FROM student_10_lines_txt +POSTHOOK: type: CREATETABLE_AS_SELECT +POSTHOOK: Input: default@student_10_lines_txt +POSTHOOK: Output: database:default +POSTHOOK: Output: default@student_10_lines +POSTHOOK: Lineage: student_10_lines.age SIMPLE [(student_10_lines_txt)student_10_lines_txt.FieldSchema(name:age, type:int, comment:null), ] +POSTHOOK: Lineage: student_10_lines.gpa SIMPLE [(student_10_lines_txt)student_10_lines_txt.FieldSchema(name:gpa, type:double, comment:null), ] +POSTHOOK: Lineage: student_10_lines.name SIMPLE [(student_10_lines_txt)student_10_lines_txt.FieldSchema(name:name, type:string, comment:null), ] +student_10_lines_txt.name student_10_lines_txt.age student_10_lines_txt.gpa +PREHOOK: query: INSERT INTO TABLE student_10_lines VALUES (NULL, NULL, NULL) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@student_10_lines +POSTHOOK: query: INSERT INTO TABLE student_10_lines VALUES (NULL, NULL, NULL) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@student_10_lines +POSTHOOK: Lineage: student_10_lines.age EXPRESSION [] +POSTHOOK: Lineage: student_10_lines.gpa EXPRESSION [] +POSTHOOK: Lineage: student_10_lines.name EXPRESSION [] +_col0 _col1 _col2 +PREHOOK: query: INSERT INTO TABLE student_10_lines VALUES ("George", 22, 3.8) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@student_10_lines +POSTHOOK: query: 
INSERT INTO TABLE student_10_lines VALUES ("George", 22, 3.8) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@student_10_lines +POSTHOOK: Lineage: student_10_lines.age SCRIPT [] +POSTHOOK: Lineage: student_10_lines.gpa SCRIPT [] +POSTHOOK: Lineage: student_10_lines.name SCRIPT [] +_col0 _col1 _col2 +PREHOOK: query: analyze table student_10_lines compute statistics +PREHOOK: type: QUERY +PREHOOK: Input: default@student_10_lines +PREHOOK: Output: default@student_10_lines +POSTHOOK: query: analyze table student_10_lines compute statistics +POSTHOOK: type: QUERY +POSTHOOK: Input: default@student_10_lines +POSTHOOK: Output: default@student_10_lines +student_10_lines.name student_10_lines.age student_10_lines.gpa +PREHOOK: query: create table insert_a_adaptor (name string, age int, gpa double, a int, b timestamp, c string, d binary, e int, f double) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@insert_a_adaptor +POSTHOOK: query: create table insert_a_adaptor (name string, age int, gpa double, a int, b timestamp, c string, d binary, e int, f double) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@insert_a_adaptor +PREHOOK: query: explain vectorization detail +insert overwrite table insert_a_adaptor + select + name, + age, + gpa, + IF(age<40, age, NULL), + IF(age>40, cast('2011-01-01 01:01:01' as timestamp), NULL), + IF(LENGTH(name)>8, name, NULL), + IF(LENGTH(name)<8, cast(name as binary), NULL), + IF(age>40, LENGTH(name), NULL), + IF(LENGTH(name)> 10, 2 * gpa, NULL) + from student_10_lines +PREHOOK: type: QUERY +POSTHOOK: query: explain vectorization detail +insert overwrite table insert_a_adaptor + select + name, + age, + gpa, + IF(age<40, age, NULL), + IF(age>40, cast('2011-01-01 01:01:01' as timestamp), NULL), + IF(LENGTH(name)>8, name, NULL), + IF(LENGTH(name)<8, cast(name as binary), NULL), + IF(age>40, LENGTH(name), NULL), + 
IF(LENGTH(name)> 10, 2 * gpa, NULL) + from student_10_lines +POSTHOOK: type: QUERY +Explain +PLAN VECTORIZATION: + enabled: true + enabledConditionsMet: [hive.vectorized.execution.enabled IS true] + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + Stage-3 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: student_10_lines + Statistics: Num rows: 12 Data size: 2352 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: name (type: string), age (type: int), gpa (type: double), if((age < 40), age, null) (type: int), if((age > 40), 2011-01-01 01:01:01.0, null) (type: timestamp), if((length(name) > 8), name, null) (type: string), if((length(name) < 8), CAST( name AS BINARY), null) (type: binary), if((age > 40), length(name), null) (type: int), if((length(name) > 10), (2.0 * gpa), null) (type: double) + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8 + Statistics: Num rows: 12 Data size: 2352 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 12 Data size: 2352 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.insert_a_adaptor + Execution mode: llap + LLAP IO: all inputs + Map Vectorization: + enabled: true + enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true + inputFileFormats: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + notVectorizedReason: SELECT operator: Unexpected primitive type category VOID + vectorized: false + + Stage: Stage-2 + Dependency Collection + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: 
org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.insert_a_adaptor + + Stage: Stage-3 + Stats Work + Basic Stats Work: + +PREHOOK: query: insert overwrite table insert_a_adaptor + select + name, + age, + gpa, + IF(age<40, age, NULL), + IF(age>40, cast('2011-01-01 01:01:01' as timestamp), NULL), + IF(LENGTH(name)>8, name, NULL), + IF(LENGTH(name)<8, cast(name as binary), NULL), + IF(age>40, LENGTH(name), NULL), + IF(LENGTH(name)> 10, 2 * gpa, NULL) + from student_10_lines +PREHOOK: type: QUERY +PREHOOK: Input: default@student_10_lines +PREHOOK: Output: default@insert_a_adaptor +POSTHOOK: query: insert overwrite table insert_a_adaptor + select + name, + age, + gpa, + IF(age<40, age, NULL), + IF(age>40, cast('2011-01-01 01:01:01' as timestamp), NULL), + IF(LENGTH(name)>8, name, NULL), + IF(LENGTH(name)<8, cast(name as binary), NULL), + IF(age>40, LENGTH(name), NULL), + IF(LENGTH(name)> 10, 2 * gpa, NULL) + from student_10_lines +POSTHOOK: type: QUERY +POSTHOOK: Input: default@student_10_lines +POSTHOOK: Output: default@insert_a_adaptor +POSTHOOK: Lineage: insert_a_adaptor.a EXPRESSION [(student_10_lines)student_10_lines.FieldSchema(name:age, type:int, comment:null), ] +POSTHOOK: Lineage: insert_a_adaptor.age SIMPLE [(student_10_lines)student_10_lines.FieldSchema(name:age, type:int, comment:null), ] +POSTHOOK: Lineage: insert_a_adaptor.b EXPRESSION [(student_10_lines)student_10_lines.FieldSchema(name:age, type:int, comment:null), ] +POSTHOOK: Lineage: insert_a_adaptor.c EXPRESSION [(student_10_lines)student_10_lines.FieldSchema(name:name, type:string, comment:null), ] +POSTHOOK: Lineage: insert_a_adaptor.d EXPRESSION [(student_10_lines)student_10_lines.FieldSchema(name:name, type:string, comment:null), ] +POSTHOOK: Lineage: insert_a_adaptor.e EXPRESSION [(student_10_lines)student_10_lines.FieldSchema(name:age, type:int, 
comment:null), (student_10_lines)student_10_lines.FieldSchema(name:name, type:string, comment:null), ] +POSTHOOK: Lineage: insert_a_adaptor.f EXPRESSION [(student_10_lines)student_10_lines.FieldSchema(name:name, type:string, comment:null), (student_10_lines)student_10_lines.FieldSchema(name:gpa, type:double, comment:null), ] +POSTHOOK: Lineage: insert_a_adaptor.gpa SIMPLE [(student_10_lines)student_10_lines.FieldSchema(name:gpa, type:double, comment:null), ] +POSTHOOK: Lineage: insert_a_adaptor.name SIMPLE [(student_10_lines)student_10_lines.FieldSchema(name:name, type:string, comment:null), ] +name age gpa _c3 _c4 _c5 _c6 _c7 _c8 +PREHOOK: query: select * from insert_a_adaptor +PREHOOK: type: QUERY +PREHOOK: Input: default@insert_a_adaptor +#### A masked pattern was here #### +POSTHOOK: query: select * from insert_a_adaptor +POSTHOOK: type: QUERY +POSTHOOK: Input: default@insert_a_adaptor +#### A masked pattern was here #### +insert_a_adaptor.name insert_a_adaptor.age insert_a_adaptor.gpa insert_a_adaptor.a insert_a_adaptor.b insert_a_adaptor.c insert_a_adaptor.d insert_a_adaptor.e insert_a_adaptor.f +George 22 3.8 22 NULL NULL George NULL NULL +NULL NULL NULL NULL NULL NULL NULL NULL NULL +calvin brown 28 2.7 28 NULL calvin brown NULL NULL 5.4 +luke brown 60 1.14 NULL 2011-01-01 01:01:01 luke brown NULL 10 NULL +luke king 28 0.47 28 NULL luke king NULL NULL NULL +nick johnson 34 NULL 34 NULL nick johnson NULL NULL NULL +oscar thompson 35 2.98 35 NULL oscar thompson NULL NULL 5.96 +priscilla falkner 55 1.16 NULL 2011-01-01 01:01:01 priscilla falkner NULL 17 2.32 +quinn ovid 19 NULL 19 NULL quinn ovid NULL NULL NULL +tom thompson 42 0.53 NULL 2011-01-01 01:01:01 tom thompson NULL 12 1.06 +ulysses garcia 35 2.74 35 NULL ulysses garcia NULL NULL 5.48 +xavier garcia 33 1.06 33 NULL xavier garcia NULL NULL 2.12 +PREHOOK: query: create table insert_a_good (name string, age int, gpa double, a int, b timestamp, c string, d binary, e int, f double) +PREHOOK: type: 
CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@insert_a_good +POSTHOOK: query: create table insert_a_good (name string, age int, gpa double, a int, b timestamp, c string, d binary, e int, f double) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@insert_a_good +PREHOOK: query: explain vectorization detail +insert overwrite table insert_a_good + select + name, + age, + gpa, + IF(age<40, age, NULL), + IF(age>40, cast('2011-01-01 01:01:01' as timestamp), NULL), + IF(LENGTH(name)>8, name, NULL), + IF(LENGTH(name)<8, cast(name as binary), NULL), + IF(age>40, LENGTH(name), NULL), + IF(LENGTH(name)> 10, 2 * gpa, NULL) + from student_10_lines +PREHOOK: type: QUERY +POSTHOOK: query: explain vectorization detail +insert overwrite table insert_a_good + select + name, + age, + gpa, + IF(age<40, age, NULL), + IF(age>40, cast('2011-01-01 01:01:01' as timestamp), NULL), + IF(LENGTH(name)>8, name, NULL), + IF(LENGTH(name)<8, cast(name as binary), NULL), + IF(age>40, LENGTH(name), NULL), + IF(LENGTH(name)> 10, 2 * gpa, NULL) + from student_10_lines +POSTHOOK: type: QUERY +Explain +PLAN VECTORIZATION: + enabled: true + enabledConditionsMet: [hive.vectorized.execution.enabled IS true] + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + Stage-3 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: student_10_lines + Statistics: Num rows: 12 Data size: 2352 Basic stats: COMPLETE Column stats: NONE + TableScan Vectorization: + native: true + vectorizationSchemaColumns: [0:name:string, 1:age:int, 2:gpa:double, 3:ROW__ID:struct] + Select Operator + expressions: name (type: string), age (type: int), gpa (type: double), if((age < 40), age, null) (type: int), if((age > 40), 2011-01-01 01:01:01.0, null) (type: timestamp), if((length(name) > 8), 
name, null) (type: string), if((length(name) < 8), CAST( name AS BINARY), null) (type: binary), if((age > 40), length(name), null) (type: int), if((length(name) > 10), (2.0 * gpa), null) (type: double) + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8 + Select Vectorization: + className: VectorSelectOperator + native: true + projectedOutputColumnNums: [0, 1, 2, 5, 8, 11, 14, 16, 20] + selectExpressions: IfExprColumnNull(col 4:boolean, col 1:int, null)(children: LongColLessLongScalar(col 1:int, val 40) -> 4:boolean, col 1:int) -> 5:int, IfExprColumnNull(col 6:boolean, col 7:timestamp, null)(children: LongColGreaterLongScalar(col 1:int, val 40) -> 6:boolean, ConstantVectorExpression(val 2011-01-01 01:01:01.0) -> 7:timestamp) -> 8:timestamp, IfExprColumnNull(col 10:boolean, col 0:string, null)(children: LongColGreaterLongScalar(col 9:int, val 8)(children: StringLength(col 0:string) -> 9:int) -> 10:boolean, col 0:string) -> 11:string, IfExprColumnNull(col 12:boolean, col 13:binary, null)(children: LongColLessLongScalar(col 9:int, val 8)(children: StringLength(col 0:string) -> 9:int) -> 12:boolean, VectorUDFAdaptor(CAST( name AS BINARY)) -> 13:binary) -> 14:binary, IfExprColumnNull(col 9:boolean, col 15:int, null)(children: LongColGreaterLongScalar(col 1:int, val 40) -> 9:boolean, StringLength(col 0:string) -> 15:int) -> 16:int, IfExprColumnNull(col 18:boolean, col 19:double, null)(children: LongColGreaterLongScalar(col 17:int, val 10)(children: StringLength(col 0:string) -> 17:int) -> 18:boolean, DoubleScalarMultiplyDoubleColumn(val 2.0, col 2:double) -> 19:double) -> 20:double + Statistics: Num rows: 12 Data size: 2352 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + File Sink Vectorization: + className: VectorFileSinkOperator + native: false + Statistics: Num rows: 12 Data size: 2352 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output 
format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.insert_a_good + Execution mode: vectorized, llap + LLAP IO: all inputs + Map Vectorization: + enabled: true + enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true + inputFormatFeatureSupport: [] + featureSupportInUse: [] + inputFileFormats: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + allNative: false + usesVectorUDFAdaptor: true + vectorized: true + rowBatchContext: + dataColumnCount: 3 + includeColumns: [0, 1, 2] + dataColumns: name:string, age:int, gpa:double + partitionColumnCount: 0 + scratchColumnTypeNames: [bigint, bigint, bigint, timestamp, timestamp, bigint, bigint, string, bigint, string, string, bigint, bigint, bigint, bigint, double, double] + + Stage: Stage-2 + Dependency Collection + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.insert_a_good + + Stage: Stage-3 + Stats Work + Basic Stats Work: + +PREHOOK: query: insert overwrite table insert_a_good + select + name, + age, + gpa, + IF(age<40, age, NULL), + IF(age>40, cast('2011-01-01 01:01:01' as timestamp), NULL), + IF(LENGTH(name)>8, name, NULL), + IF(LENGTH(name)<8, cast(name as binary), NULL), + IF(age>40, LENGTH(name), NULL), + IF(LENGTH(name)> 10, 2 * gpa, NULL) + from student_10_lines +PREHOOK: type: QUERY +PREHOOK: Input: default@student_10_lines +PREHOOK: Output: default@insert_a_good +POSTHOOK: query: insert overwrite table insert_a_good + select + name, + age, + gpa, + IF(age<40, age, NULL), + IF(age>40, cast('2011-01-01 01:01:01' as timestamp), NULL), + IF(LENGTH(name)>8, name, NULL), + IF(LENGTH(name)<8, cast(name as binary), NULL), + IF(age>40, LENGTH(name), NULL), + IF(LENGTH(name)> 10, 2 * gpa, 
NULL) + from student_10_lines +POSTHOOK: type: QUERY +POSTHOOK: Input: default@student_10_lines +POSTHOOK: Output: default@insert_a_good +POSTHOOK: Lineage: insert_a_good.a EXPRESSION [(student_10_lines)student_10_lines.FieldSchema(name:age, type:int, comment:null), ] +POSTHOOK: Lineage: insert_a_good.age SIMPLE [(student_10_lines)student_10_lines.FieldSchema(name:age, type:int, comment:null), ] +POSTHOOK: Lineage: insert_a_good.b EXPRESSION [(student_10_lines)student_10_lines.FieldSchema(name:age, type:int, comment:null), ] +POSTHOOK: Lineage: insert_a_good.c EXPRESSION [(student_10_lines)student_10_lines.FieldSchema(name:name, type:string, comment:null), ] +POSTHOOK: Lineage: insert_a_good.d EXPRESSION [(student_10_lines)student_10_lines.FieldSchema(name:name, type:string, comment:null), ] +POSTHOOK: Lineage: insert_a_good.e EXPRESSION [(student_10_lines)student_10_lines.FieldSchema(name:age, type:int, comment:null), (student_10_lines)student_10_lines.FieldSchema(name:name, type:string, comment:null), ] +POSTHOOK: Lineage: insert_a_good.f EXPRESSION [(student_10_lines)student_10_lines.FieldSchema(name:name, type:string, comment:null), (student_10_lines)student_10_lines.FieldSchema(name:gpa, type:double, comment:null), ] +POSTHOOK: Lineage: insert_a_good.gpa SIMPLE [(student_10_lines)student_10_lines.FieldSchema(name:gpa, type:double, comment:null), ] +POSTHOOK: Lineage: insert_a_good.name SIMPLE [(student_10_lines)student_10_lines.FieldSchema(name:name, type:string, comment:null), ] +name age gpa _c3 _c4 _c5 _c6 _c7 _c8 +PREHOOK: query: select * from insert_a_good PREHOOK: type: QUERY -PREHOOK: Input: default@student_2_lines -PREHOOK: Output: default@student_2_lines -POSTHOOK: query: analyze table student_2_lines compute statistics +PREHOOK: Input: default@insert_a_good +#### A masked pattern was here #### +POSTHOOK: query: select * from insert_a_good POSTHOOK: type: QUERY -POSTHOOK: Input: default@student_2_lines -POSTHOOK: Output: default@student_2_lines 
-PREHOOK: query: create table insert_10_1 (a float, b int, c timestamp, d binary) +POSTHOOK: Input: default@insert_a_good +#### A masked pattern was here #### +insert_a_good.name insert_a_good.age insert_a_good.gpa insert_a_good.a insert_a_good.b insert_a_good.c insert_a_good.d insert_a_good.e insert_a_good.f +George 22 3.8 22 NULL NULL George NULL NULL +NULL NULL NULL NULL NULL NULL NULL NULL NULL +calvin brown 28 2.7 28 NULL calvin brown NULL NULL 5.4 +luke brown 60 1.14 NULL 2011-01-01 01:01:01 luke brown NULL 10 NULL +luke king 28 0.47 28 NULL luke king NULL NULL NULL +nick johnson 34 NULL 34 NULL nick johnson NULL NULL NULL +oscar thompson 35 2.98 35 NULL oscar thompson NULL NULL 5.96 +priscilla falkner 55 1.16 NULL 2011-01-01 01:01:01 priscilla falkner NULL 17 2.32 +quinn ovid 19 NULL 19 NULL quinn ovid NULL NULL NULL +tom thompson 42 0.53 NULL 2011-01-01 01:01:01 tom thompson NULL 12 1.06 +ulysses garcia 35 2.74 35 NULL ulysses garcia NULL NULL 5.48 +xavier garcia 33 1.06 33 NULL xavier garcia NULL NULL 2.12 +PREHOOK: query: create table insert_a_better (name string, age int, gpa double, a int, b timestamp, c string, d binary, e int, f double) PREHOOK: type: CREATETABLE PREHOOK: Output: database:default -PREHOOK: Output: default@insert_10_1 -POSTHOOK: query: create table insert_10_1 (a float, b int, c timestamp, d binary) +PREHOOK: Output: default@insert_a_better +POSTHOOK: query: create table insert_a_better (name string, age int, gpa double, a int, b timestamp, c string, d binary, e int, f double) POSTHOOK: type: CREATETABLE POSTHOOK: Output: database:default -POSTHOOK: Output: default@insert_10_1 +POSTHOOK: Output: default@insert_a_better PREHOOK: query: explain vectorization detail -insert overwrite table insert_10_1 - select cast(gpa as float), - age, - IF(age>40,cast('2011-01-01 01:01:01' as timestamp),NULL), - IF(LENGTH(name)>10,cast(name as binary),NULL) from student_2_lines +insert overwrite table insert_a_better + select + name, + age, + gpa, + 
IF(age<40, age, NULL), + IF(age>40, cast('2011-01-01 01:01:01' as timestamp), NULL), + IF(LENGTH(name)>8, name, NULL), + IF(LENGTH(name)<8, cast(name as binary), NULL), + IF(age>40, LENGTH(name), NULL), + IF(LENGTH(name)> 10, 2 * gpa, NULL) + from student_10_lines PREHOOK: type: QUERY POSTHOOK: query: explain vectorization detail -insert overwrite table insert_10_1 - select cast(gpa as float), - age, - IF(age>40,cast('2011-01-01 01:01:01' as timestamp),NULL), - IF(LENGTH(name)>10,cast(name as binary),NULL) from student_2_lines +insert overwrite table insert_a_better + select + name, + age, + gpa, + IF(age<40, age, NULL), + IF(age>40, cast('2011-01-01 01:01:01' as timestamp), NULL), + IF(LENGTH(name)>8, name, NULL), + IF(LENGTH(name)<8, cast(name as binary), NULL), + IF(age>40, LENGTH(name), NULL), + IF(LENGTH(name)> 10, 2 * gpa, NULL) + from student_10_lines POSTHOOK: type: QUERY +Explain PLAN VECTORIZATION: enabled: true enabledConditionsMet: [hive.vectorized.execution.enabled IS true] @@ -74,40 +460,39 @@ STAGE PLANS: Map 1 Map Operator Tree: TableScan - alias: student_2_lines - Statistics: Num rows: 2 Data size: 392 Basic stats: COMPLETE Column stats: NONE + alias: student_10_lines + Statistics: Num rows: 12 Data size: 2352 Basic stats: COMPLETE Column stats: NONE TableScan Vectorization: native: true vectorizationSchemaColumns: [0:name:string, 1:age:int, 2:gpa:double, 3:ROW__ID:struct] Select Operator - expressions: UDFToFloat(gpa) (type: float), age (type: int), if((age > 40), 2011-01-01 01:01:01.0, null) (type: timestamp), if((length(name) > 10), CAST( name AS BINARY), null) (type: binary) - outputColumnNames: _col0, _col1, _col2, _col3 + expressions: name (type: string), age (type: int), gpa (type: double), if((age < 40), age, null) (type: int), if((age > 40), 2011-01-01 01:01:01.0, null) (type: timestamp), if((length(name) > 8), name, null) (type: string), if((length(name) < 8), CAST( name AS BINARY), null) (type: binary), if((age > 40), length(name), null) 
(type: int), if((length(name) > 10), (2.0 * gpa), null) (type: double) + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8 Select Vectorization: className: VectorSelectOperator native: true - projectedOutputColumnNums: [2, 1, 5, 8] - selectExpressions: VectorUDFAdaptor(if((age > 40), 2011-01-01 01:01:01.0, null))(children: LongColGreaterLongScalar(col 1:int, val 40) -> 4:boolean) -> 5:timestamp, VectorUDFAdaptor(if((length(name) > 10), CAST( name AS BINARY), null))(children: LongColGreaterLongScalar(col 4:int, val 10)(children: StringLength(col 0:string) -> 4:int) -> 6:boolean, VectorUDFAdaptor(CAST( name AS BINARY)) -> 7:binary) -> 8:binary - Statistics: Num rows: 2 Data size: 392 Basic stats: COMPLETE Column stats: NONE + projectedOutputColumnNums: [0, 1, 2, 5, 8, 11, 14, 16, 20] + selectExpressions: IfExprColumnNull(col 4:boolean, col 1:int, null)(children: LongColLessLongScalar(col 1:int, val 40) -> 4:boolean, col 1:int) -> 5:int, IfExprColumnNull(col 6:boolean, col 7:timestamp, null)(children: LongColGreaterLongScalar(col 1:int, val 40) -> 6:boolean, ConstantVectorExpression(val 2011-01-01 01:01:01.0) -> 7:timestamp) -> 8:timestamp, IfExprColumnNull(col 10:boolean, col 0:string, null)(children: LongColGreaterLongScalar(col 9:int, val 8)(children: StringLength(col 0:string) -> 9:int) -> 10:boolean, col 0:string) -> 11:string, IfExprCondExprNull(col 12:boolean, col 13:binary, null)(children: LongColLessLongScalar(col 9:int, val 8)(children: StringLength(col 0:string) -> 9:int) -> 12:boolean, VectorUDFAdaptor(CAST( name AS BINARY)) -> 13:binary) -> 14:binary, IfExprCondExprNull(col 9:boolean, col 15:int, null)(children: LongColGreaterLongScalar(col 1:int, val 40) -> 9:boolean, StringLength(col 0:string) -> 15:int) -> 16:int, IfExprCondExprNull(col 18:boolean, col 19:double, null)(children: LongColGreaterLongScalar(col 17:int, val 10)(children: StringLength(col 0:string) -> 17:int) -> 18:boolean, DoubleScalarMultiplyDoubleColumn(val 
2.0, col 2:double) -> 19:double) -> 20:double + Statistics: Num rows: 12 Data size: 2352 Basic stats: COMPLETE Column stats: NONE File Output Operator compressed: false File Sink Vectorization: className: VectorFileSinkOperator native: false - Statistics: Num rows: 2 Data size: 392 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 12 Data size: 2352 Basic stats: COMPLETE Column stats: NONE table: input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.insert_10_1 + name: default.insert_a_better Execution mode: vectorized, llap - LLAP IO: no inputs + LLAP IO: all inputs Map Vectorization: enabled: true - enabledConditionsMet: hive.vectorized.use.vector.serde.deserialize IS true - inputFormatFeatureSupport: [DECIMAL_64] - vectorizationSupportRemovedReasons: [DECIMAL_64 disabled because LLAP is enabled] + enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true + inputFormatFeatureSupport: [] featureSupportInUse: [] - inputFileFormats: org.apache.hadoop.mapred.TextInputFormat + inputFileFormats: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat allNative: false usesVectorUDFAdaptor: true vectorized: true @@ -116,7 +501,7 @@ STAGE PLANS: includeColumns: [0, 1, 2] dataColumns: name:string, age:int, gpa:double partitionColumnCount: 0 - scratchColumnTypeNames: [bigint, timestamp, bigint, string, string] + scratchColumnTypeNames: [bigint, bigint, bigint, timestamp, timestamp, bigint, bigint, string, bigint, string, string, bigint, bigint, bigint, bigint, double, double] Stage: Stage-2 Dependency Collection @@ -129,29 +514,578 @@ STAGE PLANS: input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.insert_10_1 + name: default.insert_a_better Stage: Stage-3 
Stats Work Basic Stats Work: -PREHOOK: query: insert overwrite table insert_10_1 - select cast(gpa as float), - age, - IF(age>40,cast('2011-01-01 01:01:01' as timestamp),NULL), - IF(LENGTH(name)>10,cast(name as binary),NULL) from student_2_lines -PREHOOK: type: QUERY -PREHOOK: Input: default@student_2_lines -PREHOOK: Output: default@insert_10_1 -POSTHOOK: query: insert overwrite table insert_10_1 - select cast(gpa as float), - age, - IF(age>40,cast('2011-01-01 01:01:01' as timestamp),NULL), - IF(LENGTH(name)>10,cast(name as binary),NULL) from student_2_lines -POSTHOOK: type: QUERY -POSTHOOK: Input: default@student_2_lines -POSTHOOK: Output: default@insert_10_1 -POSTHOOK: Lineage: insert_10_1.a EXPRESSION [(student_2_lines)student_2_lines.FieldSchema(name:gpa, type:double, comment:null), ] -POSTHOOK: Lineage: insert_10_1.b SIMPLE [(student_2_lines)student_2_lines.FieldSchema(name:age, type:int, comment:null), ] -POSTHOOK: Lineage: insert_10_1.c EXPRESSION [(student_2_lines)student_2_lines.FieldSchema(name:age, type:int, comment:null), ] -POSTHOOK: Lineage: insert_10_1.d EXPRESSION [(student_2_lines)student_2_lines.FieldSchema(name:name, type:string, comment:null), ] +PREHOOK: query: insert overwrite table insert_a_better + select + name, + age, + gpa, + IF(age<40, age, NULL), + IF(age>40, cast('2011-01-01 01:01:01' as timestamp), NULL), + IF(LENGTH(name)>8, name, NULL), + IF(LENGTH(name)<8, cast(name as binary), NULL), + IF(age>40, LENGTH(name), NULL), + IF(LENGTH(name)> 10, 2 * gpa, NULL) + from student_10_lines +PREHOOK: type: QUERY +PREHOOK: Input: default@student_10_lines +PREHOOK: Output: default@insert_a_better +POSTHOOK: query: insert overwrite table insert_a_better + select + name, + age, + gpa, + IF(age<40, age, NULL), + IF(age>40, cast('2011-01-01 01:01:01' as timestamp), NULL), + IF(LENGTH(name)>8, name, NULL), + IF(LENGTH(name)<8, cast(name as binary), NULL), + IF(age>40, LENGTH(name), NULL), + IF(LENGTH(name)> 10, 2 * gpa, NULL) + from student_10_lines 
+POSTHOOK: type: QUERY +POSTHOOK: Input: default@student_10_lines +POSTHOOK: Output: default@insert_a_better +POSTHOOK: Lineage: insert_a_better.a EXPRESSION [(student_10_lines)student_10_lines.FieldSchema(name:age, type:int, comment:null), ] +POSTHOOK: Lineage: insert_a_better.age SIMPLE [(student_10_lines)student_10_lines.FieldSchema(name:age, type:int, comment:null), ] +POSTHOOK: Lineage: insert_a_better.b EXPRESSION [(student_10_lines)student_10_lines.FieldSchema(name:age, type:int, comment:null), ] +POSTHOOK: Lineage: insert_a_better.c EXPRESSION [(student_10_lines)student_10_lines.FieldSchema(name:name, type:string, comment:null), ] +POSTHOOK: Lineage: insert_a_better.d EXPRESSION [(student_10_lines)student_10_lines.FieldSchema(name:name, type:string, comment:null), ] +POSTHOOK: Lineage: insert_a_better.e EXPRESSION [(student_10_lines)student_10_lines.FieldSchema(name:age, type:int, comment:null), (student_10_lines)student_10_lines.FieldSchema(name:name, type:string, comment:null), ] +POSTHOOK: Lineage: insert_a_better.f EXPRESSION [(student_10_lines)student_10_lines.FieldSchema(name:name, type:string, comment:null), (student_10_lines)student_10_lines.FieldSchema(name:gpa, type:double, comment:null), ] +POSTHOOK: Lineage: insert_a_better.gpa SIMPLE [(student_10_lines)student_10_lines.FieldSchema(name:gpa, type:double, comment:null), ] +POSTHOOK: Lineage: insert_a_better.name SIMPLE [(student_10_lines)student_10_lines.FieldSchema(name:name, type:string, comment:null), ] +name age gpa _c3 _c4 _c5 _c6 _c7 _c8 +PREHOOK: query: select * from insert_a_better +PREHOOK: type: QUERY +PREHOOK: Input: default@insert_a_better +#### A masked pattern was here #### +POSTHOOK: query: select * from insert_a_better +POSTHOOK: type: QUERY +POSTHOOK: Input: default@insert_a_better +#### A masked pattern was here #### +insert_a_better.name insert_a_better.age insert_a_better.gpa insert_a_better.a insert_a_better.b insert_a_better.c insert_a_better.d insert_a_better.e 
insert_a_better.f +George 22 3.8 22 NULL NULL George NULL NULL +NULL NULL NULL NULL NULL NULL NULL NULL NULL +calvin brown 28 2.7 28 NULL calvin brown NULL NULL 5.4 +luke brown 60 1.14 NULL 2011-01-01 01:01:01 luke brown NULL 10 NULL +luke king 28 0.47 28 NULL luke king NULL NULL NULL +nick johnson 34 NULL 34 NULL nick johnson NULL NULL NULL +oscar thompson 35 2.98 35 NULL oscar thompson NULL NULL 5.96 +priscilla falkner 55 1.16 NULL 2011-01-01 01:01:01 priscilla falkner NULL 17 2.32 +quinn ovid 19 NULL 19 NULL quinn ovid NULL NULL NULL +tom thompson 42 0.53 NULL 2011-01-01 01:01:01 tom thompson NULL 12 1.06 +ulysses garcia 35 2.74 35 NULL ulysses garcia NULL NULL 5.48 +xavier garcia 33 1.06 33 NULL xavier garcia NULL NULL 2.12 +PREHOOK: query: create table insert_b_adaptor (name string, age int, gpa double, a int, b timestamp, c string, d binary, e int, f double) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@insert_b_adaptor +POSTHOOK: query: create table insert_b_adaptor (name string, age int, gpa double, a int, b timestamp, c string, d binary, e int, f double) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@insert_b_adaptor +PREHOOK: query: explain vectorization detail +insert overwrite table insert_b_adaptor + select + name, + age, + gpa, + IF(age<40, NULL, age), + IF(age>40, NULL, cast('2011-01-01 01:01:01' as timestamp)), + IF(LENGTH(name)>8, NULL, name), + IF(LENGTH(name)<8, NULL, cast(name as binary)), + IF(age>40, NULL, LENGTH(name)), + IF(LENGTH(name)> 10, NULL, 2 * gpa) + from student_10_lines +PREHOOK: type: QUERY +POSTHOOK: query: explain vectorization detail +insert overwrite table insert_b_adaptor + select + name, + age, + gpa, + IF(age<40, NULL, age), + IF(age>40, NULL, cast('2011-01-01 01:01:01' as timestamp)), + IF(LENGTH(name)>8, NULL, name), + IF(LENGTH(name)<8, NULL, cast(name as binary)), + IF(age>40, NULL, LENGTH(name)), + IF(LENGTH(name)> 10, NULL, 2 * 
gpa) + from student_10_lines +POSTHOOK: type: QUERY +Explain +PLAN VECTORIZATION: + enabled: true + enabledConditionsMet: [hive.vectorized.execution.enabled IS true] + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + Stage-3 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: student_10_lines + Statistics: Num rows: 12 Data size: 2352 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: name (type: string), age (type: int), gpa (type: double), if((age < 40), null, age) (type: int), if((age > 40), null, 2011-01-01 01:01:01.0) (type: timestamp), if((length(name) > 8), null, name) (type: string), if((length(name) < 8), null, CAST( name AS BINARY)) (type: binary), if((age > 40), null, length(name)) (type: int), if((length(name) > 10), null, (2.0 * gpa)) (type: double) + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8 + Statistics: Num rows: 12 Data size: 2352 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 12 Data size: 2352 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.insert_b_adaptor + Execution mode: llap + LLAP IO: all inputs + Map Vectorization: + enabled: true + enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true + inputFileFormats: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + notVectorizedReason: SELECT operator: Unexpected primitive type category VOID + vectorized: false + + Stage: Stage-2 + Dependency Collection + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: 
org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.insert_b_adaptor + + Stage: Stage-3 + Stats Work + Basic Stats Work: + +PREHOOK: query: insert overwrite table insert_b_adaptor + select + name, + age, + gpa, + IF(age<40, NULL, age), + IF(age>40, NULL, cast('2011-01-01 01:01:01' as timestamp)), + IF(LENGTH(name)>8, NULL, name), + IF(LENGTH(name)<8, NULL, cast(name as binary)), + IF(age>40, NULL, LENGTH(name)), + IF(LENGTH(name)> 10, NULL, 2 * gpa) + from student_10_lines +PREHOOK: type: QUERY +PREHOOK: Input: default@student_10_lines +PREHOOK: Output: default@insert_b_adaptor +POSTHOOK: query: insert overwrite table insert_b_adaptor + select + name, + age, + gpa, + IF(age<40, NULL, age), + IF(age>40, NULL, cast('2011-01-01 01:01:01' as timestamp)), + IF(LENGTH(name)>8, NULL, name), + IF(LENGTH(name)<8, NULL, cast(name as binary)), + IF(age>40, NULL, LENGTH(name)), + IF(LENGTH(name)> 10, NULL, 2 * gpa) + from student_10_lines +POSTHOOK: type: QUERY +POSTHOOK: Input: default@student_10_lines +POSTHOOK: Output: default@insert_b_adaptor +POSTHOOK: Lineage: insert_b_adaptor.a EXPRESSION [(student_10_lines)student_10_lines.FieldSchema(name:age, type:int, comment:null), ] +POSTHOOK: Lineage: insert_b_adaptor.age SIMPLE [(student_10_lines)student_10_lines.FieldSchema(name:age, type:int, comment:null), ] +POSTHOOK: Lineage: insert_b_adaptor.b EXPRESSION [(student_10_lines)student_10_lines.FieldSchema(name:age, type:int, comment:null), ] +POSTHOOK: Lineage: insert_b_adaptor.c EXPRESSION [(student_10_lines)student_10_lines.FieldSchema(name:name, type:string, comment:null), ] +POSTHOOK: Lineage: insert_b_adaptor.d EXPRESSION [(student_10_lines)student_10_lines.FieldSchema(name:name, type:string, comment:null), ] +POSTHOOK: Lineage: insert_b_adaptor.e EXPRESSION [(student_10_lines)student_10_lines.FieldSchema(name:age, type:int, 
comment:null), (student_10_lines)student_10_lines.FieldSchema(name:name, type:string, comment:null), ] +POSTHOOK: Lineage: insert_b_adaptor.f EXPRESSION [(student_10_lines)student_10_lines.FieldSchema(name:name, type:string, comment:null), (student_10_lines)student_10_lines.FieldSchema(name:gpa, type:double, comment:null), ] +POSTHOOK: Lineage: insert_b_adaptor.gpa SIMPLE [(student_10_lines)student_10_lines.FieldSchema(name:gpa, type:double, comment:null), ] +POSTHOOK: Lineage: insert_b_adaptor.name SIMPLE [(student_10_lines)student_10_lines.FieldSchema(name:name, type:string, comment:null), ] +name age gpa _c3 _c4 _c5 _c6 _c7 _c8 +PREHOOK: query: select * from insert_b_adaptor +PREHOOK: type: QUERY +PREHOOK: Input: default@insert_b_adaptor +#### A masked pattern was here #### +POSTHOOK: query: select * from insert_b_adaptor +POSTHOOK: type: QUERY +POSTHOOK: Input: default@insert_b_adaptor +#### A masked pattern was here #### +insert_b_adaptor.name insert_b_adaptor.age insert_b_adaptor.gpa insert_b_adaptor.a insert_b_adaptor.b insert_b_adaptor.c insert_b_adaptor.d insert_b_adaptor.e insert_b_adaptor.f +George 22 3.8 NULL 2011-01-01 01:01:01 George NULL 6 7.6 +NULL NULL NULL NULL 2011-01-01 01:01:01 NULL NULL NULL NULL +calvin brown 28 2.7 NULL 2011-01-01 01:01:01 NULL calvin brown 12 NULL +luke brown 60 1.14 60 NULL NULL luke brown NULL 2.28 +luke king 28 0.47 NULL 2011-01-01 01:01:01 NULL luke king 9 0.94 +nick johnson 34 NULL NULL 2011-01-01 01:01:01 NULL nick johnson 12 NULL +oscar thompson 35 2.98 NULL 2011-01-01 01:01:01 NULL oscar thompson 14 NULL +priscilla falkner 55 1.16 55 NULL NULL priscilla falkner NULL NULL +quinn ovid 19 NULL NULL 2011-01-01 01:01:01 NULL quinn ovid 10 NULL +tom thompson 42 0.53 42 NULL NULL tom thompson NULL NULL +ulysses garcia 35 2.74 NULL 2011-01-01 01:01:01 NULL ulysses garcia 14 NULL +xavier garcia 33 1.06 NULL 2011-01-01 01:01:01 NULL xavier garcia 13 NULL +PREHOOK: query: create table insert_b_good (name string, age int, gpa 
double, a int, b timestamp, c string, d binary, e int, f double) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@insert_b_good +POSTHOOK: query: create table insert_b_good (name string, age int, gpa double, a int, b timestamp, c string, d binary, e int, f double) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@insert_b_good +PREHOOK: query: explain vectorization detail +insert overwrite table insert_b_good + select + name, + age, + gpa, + IF(age<40, NULL, age), + IF(age>40, NULL, cast('2011-01-01 01:01:01' as timestamp)), + IF(LENGTH(name)>8, NULL, name), + IF(LENGTH(name)<8, NULL, cast(name as binary)), + IF(age>40, NULL, LENGTH(name)), + IF(LENGTH(name)> 10, NULL, 2 * gpa) + from student_10_lines +PREHOOK: type: QUERY +POSTHOOK: query: explain vectorization detail +insert overwrite table insert_b_good + select + name, + age, + gpa, + IF(age<40, NULL, age), + IF(age>40, NULL, cast('2011-01-01 01:01:01' as timestamp)), + IF(LENGTH(name)>8, NULL, name), + IF(LENGTH(name)<8, NULL, cast(name as binary)), + IF(age>40, NULL, LENGTH(name)), + IF(LENGTH(name)> 10, NULL, 2 * gpa) + from student_10_lines +POSTHOOK: type: QUERY +Explain +PLAN VECTORIZATION: + enabled: true + enabledConditionsMet: [hive.vectorized.execution.enabled IS true] + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + Stage-3 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: student_10_lines + Statistics: Num rows: 12 Data size: 2352 Basic stats: COMPLETE Column stats: NONE + TableScan Vectorization: + native: true + vectorizationSchemaColumns: [0:name:string, 1:age:int, 2:gpa:double, 3:ROW__ID:struct] + Select Operator + expressions: name (type: string), age (type: int), gpa (type: double), if((age < 40), null, age) (type: int), 
if((age > 40), null, 2011-01-01 01:01:01.0) (type: timestamp), if((length(name) > 8), null, name) (type: string), if((length(name) < 8), null, CAST( name AS BINARY)) (type: binary), if((age > 40), null, length(name)) (type: int), if((length(name) > 10), null, (2.0 * gpa)) (type: double) + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8 + Select Vectorization: + className: VectorSelectOperator + native: true + projectedOutputColumnNums: [0, 1, 2, 5, 8, 11, 14, 16, 20] + selectExpressions: IfExprNullColumn(col 4:boolean, null, col 1)(children: LongColLessLongScalar(col 1:int, val 40) -> 4:boolean, col 1:int) -> 5:int, IfExprNullColumn(col 6:boolean, null, col 7)(children: LongColGreaterLongScalar(col 1:int, val 40) -> 6:boolean, ConstantVectorExpression(val 2011-01-01 01:01:01.0) -> 7:timestamp) -> 8:timestamp, IfExprNullColumn(col 10:boolean, null, col 0)(children: LongColGreaterLongScalar(col 9:int, val 8)(children: StringLength(col 0:string) -> 9:int) -> 10:boolean, col 0:string) -> 11:string, IfExprNullColumn(col 12:boolean, null, col 13)(children: LongColLessLongScalar(col 9:int, val 8)(children: StringLength(col 0:string) -> 9:int) -> 12:boolean, VectorUDFAdaptor(CAST( name AS BINARY)) -> 13:binary) -> 14:binary, IfExprNullColumn(col 9:boolean, null, col 15)(children: LongColGreaterLongScalar(col 1:int, val 40) -> 9:boolean, StringLength(col 0:string) -> 15:int) -> 16:int, IfExprNullColumn(col 18:boolean, null, col 19)(children: LongColGreaterLongScalar(col 17:int, val 10)(children: StringLength(col 0:string) -> 17:int) -> 18:boolean, DoubleScalarMultiplyDoubleColumn(val 2.0, col 2:double) -> 19:double) -> 20:double + Statistics: Num rows: 12 Data size: 2352 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + File Sink Vectorization: + className: VectorFileSinkOperator + native: false + Statistics: Num rows: 12 Data size: 2352 Basic stats: COMPLETE Column stats: NONE + table: + input format: 
org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.insert_b_good + Execution mode: vectorized, llap + LLAP IO: all inputs + Map Vectorization: + enabled: true + enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true + inputFormatFeatureSupport: [] + featureSupportInUse: [] + inputFileFormats: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + allNative: false + usesVectorUDFAdaptor: true + vectorized: true + rowBatchContext: + dataColumnCount: 3 + includeColumns: [0, 1, 2] + dataColumns: name:string, age:int, gpa:double + partitionColumnCount: 0 + scratchColumnTypeNames: [bigint, bigint, bigint, timestamp, timestamp, bigint, bigint, string, bigint, string, string, bigint, bigint, bigint, bigint, double, double] + + Stage: Stage-2 + Dependency Collection + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.insert_b_good + + Stage: Stage-3 + Stats Work + Basic Stats Work: + +PREHOOK: query: insert overwrite table insert_b_good + select + name, + age, + gpa, + IF(age<40, NULL, age), + IF(age>40, NULL, cast('2011-01-01 01:01:01' as timestamp)), + IF(LENGTH(name)>8, NULL, name), + IF(LENGTH(name)<8, NULL, cast(name as binary)), + IF(age>40, NULL, LENGTH(name)), + IF(LENGTH(name)> 10, NULL, 2 * gpa) + from student_10_lines +PREHOOK: type: QUERY +PREHOOK: Input: default@student_10_lines +PREHOOK: Output: default@insert_b_good +POSTHOOK: query: insert overwrite table insert_b_good + select + name, + age, + gpa, + IF(age<40, NULL, age), + IF(age>40, NULL, cast('2011-01-01 01:01:01' as timestamp)), + IF(LENGTH(name)>8, NULL, name), + IF(LENGTH(name)<8, NULL, cast(name as binary)), + IF(age>40, 
NULL, LENGTH(name)), + IF(LENGTH(name)> 10, NULL, 2 * gpa) + from student_10_lines +POSTHOOK: type: QUERY +POSTHOOK: Input: default@student_10_lines +POSTHOOK: Output: default@insert_b_good +POSTHOOK: Lineage: insert_b_good.a EXPRESSION [(student_10_lines)student_10_lines.FieldSchema(name:age, type:int, comment:null), ] +POSTHOOK: Lineage: insert_b_good.age SIMPLE [(student_10_lines)student_10_lines.FieldSchema(name:age, type:int, comment:null), ] +POSTHOOK: Lineage: insert_b_good.b EXPRESSION [(student_10_lines)student_10_lines.FieldSchema(name:age, type:int, comment:null), ] +POSTHOOK: Lineage: insert_b_good.c EXPRESSION [(student_10_lines)student_10_lines.FieldSchema(name:name, type:string, comment:null), ] +POSTHOOK: Lineage: insert_b_good.d EXPRESSION [(student_10_lines)student_10_lines.FieldSchema(name:name, type:string, comment:null), ] +POSTHOOK: Lineage: insert_b_good.e EXPRESSION [(student_10_lines)student_10_lines.FieldSchema(name:age, type:int, comment:null), (student_10_lines)student_10_lines.FieldSchema(name:name, type:string, comment:null), ] +POSTHOOK: Lineage: insert_b_good.f EXPRESSION [(student_10_lines)student_10_lines.FieldSchema(name:name, type:string, comment:null), (student_10_lines)student_10_lines.FieldSchema(name:gpa, type:double, comment:null), ] +POSTHOOK: Lineage: insert_b_good.gpa SIMPLE [(student_10_lines)student_10_lines.FieldSchema(name:gpa, type:double, comment:null), ] +POSTHOOK: Lineage: insert_b_good.name SIMPLE [(student_10_lines)student_10_lines.FieldSchema(name:name, type:string, comment:null), ] +name age gpa _c3 _c4 _c5 _c6 _c7 _c8 +PREHOOK: query: select * from insert_b_good +PREHOOK: type: QUERY +PREHOOK: Input: default@insert_b_good +#### A masked pattern was here #### +POSTHOOK: query: select * from insert_b_good +POSTHOOK: type: QUERY +POSTHOOK: Input: default@insert_b_good +#### A masked pattern was here #### +insert_b_good.name insert_b_good.age insert_b_good.gpa insert_b_good.a insert_b_good.b insert_b_good.c 
insert_b_good.d insert_b_good.e insert_b_good.f +George 22 3.8 NULL 2011-01-01 01:01:01 George NULL 6 7.6 +NULL NULL NULL NULL 2011-01-01 01:01:01 NULL NULL NULL NULL +calvin brown 28 2.7 NULL 2011-01-01 01:01:01 NULL calvin brown 12 NULL +luke brown 60 1.14 60 NULL NULL luke brown NULL 2.28 +luke king 28 0.47 NULL 2011-01-01 01:01:01 NULL luke king 9 0.94 +nick johnson 34 NULL NULL 2011-01-01 01:01:01 NULL nick johnson 12 NULL +oscar thompson 35 2.98 NULL 2011-01-01 01:01:01 NULL oscar thompson 14 NULL +priscilla falkner 55 1.16 55 NULL NULL priscilla falkner NULL NULL +quinn ovid 19 NULL NULL 2011-01-01 01:01:01 NULL quinn ovid 10 NULL +tom thompson 42 0.53 42 NULL NULL tom thompson NULL NULL +ulysses garcia 35 2.74 NULL 2011-01-01 01:01:01 NULL ulysses garcia 14 NULL +xavier garcia 33 1.06 NULL 2011-01-01 01:01:01 NULL xavier garcia 13 NULL +PREHOOK: query: create table insert_b_better (name string, age int, gpa double, a int, b timestamp, c string, d binary, e int, f double) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@insert_b_better +POSTHOOK: query: create table insert_b_better (name string, age int, gpa double, a int, b timestamp, c string, d binary, e int, f double) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@insert_b_better +PREHOOK: query: explain vectorization detail +insert overwrite table insert_b_better + select + name, + age, + gpa, + IF(age<40, NULL, age), + IF(age>40, NULL, cast('2011-01-01 01:01:01' as timestamp)), + IF(LENGTH(name)>8, NULL, name), + IF(LENGTH(name)<8, NULL, cast(name as binary)), + IF(age>40, NULL, LENGTH(name)), + IF(LENGTH(name)> 10, NULL, 2 * gpa) + from student_10_lines +PREHOOK: type: QUERY +POSTHOOK: query: explain vectorization detail +insert overwrite table insert_b_better + select + name, + age, + gpa, + IF(age<40, NULL, age), + IF(age>40, NULL, cast('2011-01-01 01:01:01' as timestamp)), + IF(LENGTH(name)>8, NULL, name), + 
IF(LENGTH(name)<8, NULL, cast(name as binary)), + IF(age>40, NULL, LENGTH(name)), + IF(LENGTH(name)> 10, NULL, 2 * gpa) + from student_10_lines +POSTHOOK: type: QUERY +Explain +PLAN VECTORIZATION: + enabled: true + enabledConditionsMet: [hive.vectorized.execution.enabled IS true] + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + Stage-3 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: student_10_lines + Statistics: Num rows: 12 Data size: 2352 Basic stats: COMPLETE Column stats: NONE + TableScan Vectorization: + native: true + vectorizationSchemaColumns: [0:name:string, 1:age:int, 2:gpa:double, 3:ROW__ID:struct] + Select Operator + expressions: name (type: string), age (type: int), gpa (type: double), if((age < 40), null, age) (type: int), if((age > 40), null, 2011-01-01 01:01:01.0) (type: timestamp), if((length(name) > 8), null, name) (type: string), if((length(name) < 8), null, CAST( name AS BINARY)) (type: binary), if((age > 40), null, length(name)) (type: int), if((length(name) > 10), null, (2.0 * gpa)) (type: double) + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8 + Select Vectorization: + className: VectorSelectOperator + native: true + projectedOutputColumnNums: [0, 1, 2, 5, 8, 11, 14, 16, 20] + selectExpressions: IfExprNullColumn(col 4:boolean, null, col 1)(children: LongColLessLongScalar(col 1:int, val 40) -> 4:boolean, col 1:int) -> 5:int, IfExprNullColumn(col 6:boolean, null, col 7)(children: LongColGreaterLongScalar(col 1:int, val 40) -> 6:boolean, ConstantVectorExpression(val 2011-01-01 01:01:01.0) -> 7:timestamp) -> 8:timestamp, IfExprNullColumn(col 10:boolean, null, col 0)(children: LongColGreaterLongScalar(col 9:int, val 8)(children: StringLength(col 0:string) -> 9:int) -> 10:boolean, col 0:string) -> 11:string, 
IfExprNullCondExpr(col 12:boolean, null, col 13:binary)(children: LongColLessLongScalar(col 9:int, val 8)(children: StringLength(col 0:string) -> 9:int) -> 12:boolean, VectorUDFAdaptor(CAST( name AS BINARY)) -> 13:binary) -> 14:binary, IfExprNullCondExpr(col 9:boolean, null, col 15:int)(children: LongColGreaterLongScalar(col 1:int, val 40) -> 9:boolean, StringLength(col 0:string) -> 15:int) -> 16:int, IfExprNullCondExpr(col 18:boolean, null, col 19:double)(children: LongColGreaterLongScalar(col 17:int, val 10)(children: StringLength(col 0:string) -> 17:int) -> 18:boolean, DoubleScalarMultiplyDoubleColumn(val 2.0, col 2:double) -> 19:double) -> 20:double + Statistics: Num rows: 12 Data size: 2352 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + File Sink Vectorization: + className: VectorFileSinkOperator + native: false + Statistics: Num rows: 12 Data size: 2352 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.insert_b_better + Execution mode: vectorized, llap + LLAP IO: all inputs + Map Vectorization: + enabled: true + enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true + inputFormatFeatureSupport: [] + featureSupportInUse: [] + inputFileFormats: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + allNative: false + usesVectorUDFAdaptor: true + vectorized: true + rowBatchContext: + dataColumnCount: 3 + includeColumns: [0, 1, 2] + dataColumns: name:string, age:int, gpa:double + partitionColumnCount: 0 + scratchColumnTypeNames: [bigint, bigint, bigint, timestamp, timestamp, bigint, bigint, string, bigint, string, string, bigint, bigint, bigint, bigint, double, double] + + Stage: Stage-2 + Dependency Collection + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: 
org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.insert_b_better + + Stage: Stage-3 + Stats Work + Basic Stats Work: + +PREHOOK: query: insert overwrite table insert_b_better + select + name, + age, + gpa, + IF(age<40, NULL, age), + IF(age>40, NULL, cast('2011-01-01 01:01:01' as timestamp)), + IF(LENGTH(name)>8, NULL, name), + IF(LENGTH(name)<8, NULL, cast(name as binary)), + IF(age>40, NULL, LENGTH(name)), + IF(LENGTH(name)> 10, NULL, 2 * gpa) + from student_10_lines +PREHOOK: type: QUERY +PREHOOK: Input: default@student_10_lines +PREHOOK: Output: default@insert_b_better +POSTHOOK: query: insert overwrite table insert_b_better + select + name, + age, + gpa, + IF(age<40, NULL, age), + IF(age>40, NULL, cast('2011-01-01 01:01:01' as timestamp)), + IF(LENGTH(name)>8, NULL, name), + IF(LENGTH(name)<8, NULL, cast(name as binary)), + IF(age>40, NULL, LENGTH(name)), + IF(LENGTH(name)> 10, NULL, 2 * gpa) + from student_10_lines +POSTHOOK: type: QUERY +POSTHOOK: Input: default@student_10_lines +POSTHOOK: Output: default@insert_b_better +POSTHOOK: Lineage: insert_b_better.a EXPRESSION [(student_10_lines)student_10_lines.FieldSchema(name:age, type:int, comment:null), ] +POSTHOOK: Lineage: insert_b_better.age SIMPLE [(student_10_lines)student_10_lines.FieldSchema(name:age, type:int, comment:null), ] +POSTHOOK: Lineage: insert_b_better.b EXPRESSION [(student_10_lines)student_10_lines.FieldSchema(name:age, type:int, comment:null), ] +POSTHOOK: Lineage: insert_b_better.c EXPRESSION [(student_10_lines)student_10_lines.FieldSchema(name:name, type:string, comment:null), ] +POSTHOOK: Lineage: insert_b_better.d EXPRESSION [(student_10_lines)student_10_lines.FieldSchema(name:name, type:string, comment:null), ] +POSTHOOK: Lineage: insert_b_better.e EXPRESSION [(student_10_lines)student_10_lines.FieldSchema(name:age, type:int, comment:null), 
(student_10_lines)student_10_lines.FieldSchema(name:name, type:string, comment:null), ] +POSTHOOK: Lineage: insert_b_better.f EXPRESSION [(student_10_lines)student_10_lines.FieldSchema(name:name, type:string, comment:null), (student_10_lines)student_10_lines.FieldSchema(name:gpa, type:double, comment:null), ] +POSTHOOK: Lineage: insert_b_better.gpa SIMPLE [(student_10_lines)student_10_lines.FieldSchema(name:gpa, type:double, comment:null), ] +POSTHOOK: Lineage: insert_b_better.name SIMPLE [(student_10_lines)student_10_lines.FieldSchema(name:name, type:string, comment:null), ] +name age gpa _c3 _c4 _c5 _c6 _c7 _c8 +PREHOOK: query: select * from insert_b_better +PREHOOK: type: QUERY +PREHOOK: Input: default@insert_b_better +#### A masked pattern was here #### +POSTHOOK: query: select * from insert_b_better +POSTHOOK: type: QUERY +POSTHOOK: Input: default@insert_b_better +#### A masked pattern was here #### +insert_b_better.name insert_b_better.age insert_b_better.gpa insert_b_better.a insert_b_better.b insert_b_better.c insert_b_better.d insert_b_better.e insert_b_better.f +George 22 3.8 NULL 2011-01-01 01:01:01 George NULL 6 7.6 +NULL NULL NULL NULL 2011-01-01 01:01:01 NULL NULL NULL NULL +calvin brown 28 2.7 NULL 2011-01-01 01:01:01 NULL calvin brown 12 NULL +luke brown 60 1.14 60 NULL NULL luke brown NULL 2.28 +luke king 28 0.47 NULL 2011-01-01 01:01:01 NULL luke king 9 0.94 +nick johnson 34 NULL NULL 2011-01-01 01:01:01 NULL nick johnson 12 NULL +oscar thompson 35 2.98 NULL 2011-01-01 01:01:01 NULL oscar thompson 14 NULL +priscilla falkner 55 1.16 55 NULL NULL priscilla falkner NULL NULL +quinn ovid 19 NULL NULL 2011-01-01 01:01:01 NULL quinn ovid 10 NULL +tom thompson 42 0.53 42 NULL NULL tom thompson NULL NULL +ulysses garcia 35 2.74 NULL 2011-01-01 01:01:01 NULL ulysses garcia 14 NULL +xavier garcia 33 1.06 NULL 2011-01-01 01:01:01 NULL xavier garcia 13 NULL