diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/TopNKeyFilter.java ql/src/java/org/apache/hadoop/hive/ql/exec/TopNKeyFilter.java
new file mode 100644
index 0000000000..4998766f06
--- /dev/null
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/TopNKeyFilter.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.exec;
+
+import java.util.Comparator;
+import java.util.PriorityQueue;
+
+/**
+ * Filters out keys that are not among the top n keys seen so far.
+ * An instance of this class is wrapped in {@link TopNKeyOperator} and
+ * {@link org.apache.hadoop.hive.ql.exec.vector.VectorTopNKeyOperator}.
+ * @param <T> type of {@link KeyWrapper}. Each key is stored in a KeyWrapper instance.
+ */
+public class TopNKeyFilter<T extends KeyWrapper> {
+  private final PriorityQueue<T> priorityQueue;
+  private final int topN;
+
+  public TopNKeyFilter(int topN, Comparator<T> comparator) {
+    // We need a reversed comparator because the PriorityQueue.poll() method is used for filtering out keys.
+    // Ex.: with ORDER BY key1 ASC, a call to poll() should remove the largest key.
+    this.priorityQueue = new PriorityQueue<>(topN + 1, comparator.reversed());
+    this.topN = topN;
+  }
+
+  public boolean canForward(T kw) {
+    if (!priorityQueue.contains(kw)) {
+      priorityQueue.offer((T) kw.copyKey());
+    }
+    if (priorityQueue.size() > topN) {
+      priorityQueue.poll();
+    }
+
+    return priorityQueue.contains(kw);
+  }
+
+  public void clear() {
+    priorityQueue.clear();
+  }
+}
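Note: the filter keeps at most topN + 1 keys in a priority queue built on the reversed ORDER BY comparator, so the head of the queue is always the worst key retained and poll() evicts it on overflow; a row may be forwarded only while its key survives in the queue. A minimal, self-contained sketch of the same technique over plain integers (IntTopNFilter and its names are illustrative, not part of the patch):

  import java.util.Comparator;
  import java.util.PriorityQueue;

  public final class IntTopNFilter {
    private final PriorityQueue<Integer> queue; // head = worst key kept so far
    private final int topN;

    public IntTopNFilter(int topN, Comparator<Integer> orderBy) {
      // Reverse the ORDER BY comparator so poll() removes the worst key.
      this.queue = new PriorityQueue<>(topN + 1, orderBy.reversed());
      this.topN = topN;
    }

    public boolean canForward(int key) {
      if (!queue.contains(key)) {
        queue.offer(key);
      }
      if (queue.size() > topN) {
        queue.poll(); // evict the worst key, possibly the one just offered
      }
      return queue.contains(key);
    }
  }

With topN = 2 and natural (ascending) order, the sequence 5, 9, 3 yields true, true, true; the third call evicts 9, so a later canForward(9) returns false. PriorityQueue.contains is a linear scan, so each row costs O(topN) comparisons in exchange for bounded memory.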
diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/TopNKeyOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/TopNKeyOperator.java
index d16500ef05..96b2968cb2 100644
--- ql/src/java/org/apache/hadoop/hive/ql/exec/TopNKeyOperator.java
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/TopNKeyOperator.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
@@ -30,7 +30,6 @@
 import java.io.Serializable;
 import java.util.Comparator;
-import java.util.PriorityQueue;
 
 import static org.apache.hadoop.hive.ql.plan.api.OperatorType.TOPNKEY;
@@ -41,11 +40,7 @@
   private static final long serialVersionUID = 1L;
 
-  // Maximum number of keys to hold
-  private transient int topN;
-
-  // Priority queue that holds occurred keys
-  private transient PriorityQueue<KeyWrapper> priorityQueue;
+  private transient TopNKeyFilter<KeyWrapper> topNKeyFilter;
 
   private transient KeyWrapper keyWrapper;
@@ -86,7 +81,12 @@ public int compare(KeyWrapper key1, KeyWrapper key2) {
   protected void initializeOp(Configuration hconf) throws HiveException {
     super.initializeOp(hconf);
 
-    this.topN = conf.getTopN();
+    String columnSortOrder = conf.getColumnSortOrder();
+    String nullSortOrder = conf.getNullOrder();
+    boolean[] columnSortOrderIsDesc = new boolean[columnSortOrder.length()];
+    for (int i = 0; i < columnSortOrderIsDesc.length; i++) {
+      columnSortOrderIsDesc[i] = (columnSortOrder.charAt(i) == '-');
+    }
 
     ObjectInspector rowInspector = inputObjInspectors[0];
     ObjectInspector standardObjInspector = ObjectInspectorUtils.getStandardObjectInspector(rowInspector);
@@ -107,13 +107,8 @@ protected void initializeOp(Configuration hconf) throws HiveException {
       standardKeyObjectInspectors[i] = standardKeyFields[i].initialize(standardObjInspector);
     }
 
-    String columnSortOrder = conf.getColumnSortOrder();
-    String nullSortOrder = conf.getNullOrder();
-
-    // We need a reversed comparator because the PriorityQueue.poll() method is used for filtering out keys.
-    // Ex.: When ORDER BY key1 ASC then call of poll() should remove the largest key.
-    priorityQueue = new PriorityQueue<>(topN + 1,
-        new KeyWrapperComparator(standardKeyObjectInspectors, columnSortOrder, nullSortOrder).reversed());
+    this.topNKeyFilter = new TopNKeyFilter<>(conf.getTopN(), new TopNKeyOperator.KeyWrapperComparator(
+        standardKeyObjectInspectors, columnSortOrder, nullSortOrder));
 
     KeyWrapperFactory keyWrapperFactory = new KeyWrapperFactory(keyFields, keyObjectInspectors, standardKeyObjectInspectors);
@@ -122,28 +117,16 @@ protected void initializeOp(Configuration hconf) throws HiveException {
 
   @Override
   public void process(Object row, int tag) throws HiveException {
-    if (canProcess(row, tag)) {
-      forward(row, outputObjInspector);
-    }
-  }
-
-  protected boolean canProcess(Object row, int tag) throws HiveException {
     keyWrapper.getNewKey(row, inputObjInspectors[tag]);
     keyWrapper.setHashKey();
-
-    if (!priorityQueue.contains(keyWrapper)) {
-      priorityQueue.offer(keyWrapper.copyKey());
-    }
-    if (priorityQueue.size() > topN) {
-      priorityQueue.poll();
+    if (topNKeyFilter.canForward(keyWrapper)) {
+      forward(row, outputObjInspector);
     }
-
-    return priorityQueue.contains(keyWrapper);
   }
 
   @Override
   protected final void closeOp(boolean abort) throws HiveException {
-    priorityQueue.clear();
+    topNKeyFilter.clear();
     super.closeOp(abort);
   }
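Note: the operator receives its ordering as two parallel strings from the descriptor, one character per key column: conf.getColumnSortOrder() uses '+'/'-' for ascending/descending (the loop above tests charAt(i) == '-'), and conf.getNullOrder() encodes null placement per column, which NullOrdering.fromSign later maps to NULLS_FIRST or NULLS_LAST ('a'/'z', matching the reduceColumnNullOrder entries in the golden files below). A hedged sketch of that decoding, with illustrative names:

  // Decodes "+-"/"az" style order strings into per-column flags.
  final class OrderSpec {
    final boolean[] isDescending;
    final boolean[] nullsFirst;

    OrderSpec(String sortOrder, String nullOrder) {
      assert sortOrder.length() == nullOrder.length();
      isDescending = new boolean[sortOrder.length()];
      nullsFirst = new boolean[nullOrder.length()];
      for (int i = 0; i < sortOrder.length(); i++) {
        isDescending[i] = sortOrder.charAt(i) == '-';
        nullsFirst[i] = nullOrder.charAt(i) == 'a'; // assumed: 'a' nulls first, 'z' nulls last
      }
    }
  }

For example, new OrderSpec("+-", "az") would describe ORDER BY c0 ASC NULLS FIRST, c1 DESC NULLS LAST.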
diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorTopNKeyOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorTopNKeyOperator.java
index c80bc804a2..5faa038c18 100644
--- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorTopNKeyOperator.java
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorTopNKeyOperator.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
@@ -21,31 +21,32 @@
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hive.ql.CompilationOpContext;
 import org.apache.hadoop.hive.ql.exec.Operator;
+import org.apache.hadoop.hive.ql.exec.TopNKeyFilter;
 import org.apache.hadoop.hive.ql.exec.TopNKeyOperator;
 import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression;
+import org.apache.hadoop.hive.ql.exec.vector.wrapper.VectorHashKeyWrapperBase;
+import org.apache.hadoop.hive.ql.exec.vector.wrapper.VectorHashKeyWrapperBatch;
 import org.apache.hadoop.hive.ql.metadata.HiveException;
 import org.apache.hadoop.hive.ql.plan.OperatorDesc;
 import org.apache.hadoop.hive.ql.plan.TopNKeyDesc;
 import org.apache.hadoop.hive.ql.plan.VectorDesc;
 import org.apache.hadoop.hive.ql.plan.VectorTopNKeyDesc;
-import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
+import org.apache.hadoop.hive.ql.plan.api.OperatorType;
 
 /**
  * VectorTopNKeyOperator passes rows that contains top N keys only.
  */
-public class VectorTopNKeyOperator extends TopNKeyOperator implements VectorizationOperator {
+public class VectorTopNKeyOperator extends Operator<TopNKeyDesc> implements VectorizationOperator {
 
   private static final long serialVersionUID = 1L;
 
   private VectorTopNKeyDesc vectorDesc;
   private VectorizationContext vContext;
 
-  // Extract row
-  private transient Object[] extractedRow;
-  private transient VectorExtractRow vectorExtractRow;
-
   // Batch processing
   private transient int[] temporarySelected;
+  private transient VectorHashKeyWrapperBatch keyWrappersBatch;
+  private transient TopNKeyFilter<VectorHashKeyWrapperBase> topNKeyFilter;
 
   public VectorTopNKeyOperator(CompilationOpContext ctx, OperatorDesc conf, VectorizationContext vContext, VectorDesc vectorDesc) {
@@ -70,17 +71,18 @@ public VectorTopNKeyOperator(CompilationOpContext ctx) {
   protected void initializeOp(Configuration hconf) throws HiveException {
     super.initializeOp(hconf);
 
-    VectorExpression.doTransientInit(vectorDesc.getKeyExpressions(), hconf);
-    for (VectorExpression keyExpression : vectorDesc.getKeyExpressions()) {
+    VectorExpression[] keyExpressions = vectorDesc.getKeyExpressions();
+    VectorExpression.doTransientInit(keyExpressions, hconf);
+    for (VectorExpression keyExpression : keyExpressions) {
       keyExpression.init(hconf);
     }
 
-    vectorExtractRow = new VectorExtractRow();
-    vectorExtractRow.init((StructObjectInspector) inputObjInspectors[0],
-        vContext.getProjectedColumns());
-    extractedRow = new Object[vectorExtractRow.getCount()];
-
     temporarySelected = new int [VectorizedRowBatch.DEFAULT_SIZE];
+
+    keyWrappersBatch = VectorHashKeyWrapperBatch.compileKeyWrapperBatch(keyExpressions);
+    this.topNKeyFilter = new TopNKeyFilter<>(conf.getTopN(), keyWrappersBatch.getComparator(
+        conf.getColumnSortOrder(),
+        conf.getNullOrder()));
   }
 
   @Override
@@ -99,6 +101,9 @@ public void process(Object data, int tag) throws HiveException {
       keyExpression.evaluate(batch);
     }
 
+    keyWrappersBatch.evaluateBatch(batch);
+    VectorHashKeyWrapperBase[] keyWrappers = keyWrappersBatch.getVectorHashKeyWrappers();
+
     // Filter rows with top n keys
     int size = 0;
     int[] selected = new int[batch.selected.length];
@@ -110,11 +115,8 @@
         j = i;
       }
 
-      // Get keys
-      vectorExtractRow.extractRow(batch, j, extractedRow);
-
-      // Select a row in the priority queue
-      if (canProcess(extractedRow, tag)) {
+      if (topNKeyFilter.canForward(keyWrappers[i])) {
         selected[size++] = j;
       }
     }
@@ -154,4 +156,44 @@ public void setNextVectorBatchGroupStatus(boolean isLastGroupBatch) throws HiveException {
       op.setNextVectorBatchGroupStatus(isLastGroupBatch);
     }
   }
+
+  @Override
+  public String getName() {
+    return TopNKeyOperator.getOperatorName();
+  }
+
+  @Override
+  public OperatorType getType() {
+    return OperatorType.TOPNKEY;
+  }
+
+  @Override
+  protected void closeOp(boolean abort) throws HiveException {
+    topNKeyFilter.clear();
+    super.closeOp(abort);
+  }
+
+  // Because a TopNKeyOperator works like a FilterOperator with a top-n key condition, its properties
+  // for optimizers have the same values. The following methods are the same as in FilterOperator:
+  // supportSkewJoinOptimization, columnNamesRowResolvedCanBeObtained,
+  // supportAutomaticSortMergeJoin, and supportUnionRemoveOptimization.
+  @Override
+  public boolean supportSkewJoinOptimization() {
+    return true;
+  }
+
+  @Override
+  public boolean columnNamesRowResolvedCanBeObtained() {
+    return true;
+  }
+
+  @Override
+  public boolean supportAutomaticSortMergeJoin() {
+    return true;
+  }
+
+  @Override
+  public boolean supportUnionRemoveOptimization() {
+    return true;
+  }
 }
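Note: the vectorized operator filters in place on the batch: it evaluates one key wrapper per batch position, asks the shared TopNKeyFilter whether that key can be forwarded, and rebuilds batch.selected so only surviving rows remain. A simplified sketch of that selected-array rewrite (VectorizedRowBatch and its selected/size/selectedInUse fields are real; the predicate stands in for topNKeyFilter.canForward, and the real operator additionally saves and restores the original selected array around forwarding):

  import java.util.function.IntPredicate;
  import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;

  final class BatchFilterSketch {
    // Keeps only the rows whose index passes the predicate, preserving order.
    static void filterBatch(VectorizedRowBatch batch, IntPredicate keep) {
      int newSize = 0;
      int[] newSelected = new int[batch.selected.length];
      for (int i = 0; i < batch.size; i++) {
        // When selectedInUse is set, logical row i maps to selected[i].
        int row = batch.selectedInUse ? batch.selected[i] : i;
        if (keep.test(row)) {
          newSelected[newSize++] = row;
        }
      }
      batch.size = newSize;
      batch.selected = newSelected;
      batch.selectedInUse = true;
    }
  }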
diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/wrapper/VectorHashKeyWrapperBatch.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/wrapper/VectorHashKeyWrapperBatch.java
index dd31991d03..0786c82b7b 100644
--- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/wrapper/VectorHashKeyWrapperBatch.java
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/wrapper/VectorHashKeyWrapperBatch.java
@@ -18,6 +18,8 @@
 package org.apache.hadoop.hive.ql.exec.vector.wrapper;
 
+import java.util.Comparator;
+
 import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression;
 import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpressionWriter;
 import org.apache.hadoop.hive.ql.metadata.HiveException;
@@ -1072,5 +1074,18 @@ public int getVariableSize(int batchSize) {
     }
     return variableSize;
   }
+
+  public Comparator<VectorHashKeyWrapperBase> getComparator(String columnSortOrder, String nullOrder) {
+    VectorHashKeyWrapperGeneralComparator comparator =
+        new VectorHashKeyWrapperGeneralComparator(columnVectorTypes.length);
+    for (int i = 0; i < columnVectorTypes.length; ++i) {
+      final int columnTypeSpecificIndex = columnTypeSpecificIndices[i];
+      ColumnVector.Type columnVectorType = columnVectorTypes[i];
+      comparator.addColumnComparator(
+          i, columnTypeSpecificIndex, columnVectorType, columnSortOrder.charAt(i), nullOrder.charAt(i));
+    }
+
+    return comparator;
+  }
 }
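Note: getComparator builds one single-column comparator per key and relies on VectorHashKeyWrapperGeneralComparator (next file) to apply them in key order, falling through to the next column only on a tie, i.e. a plain lexicographic composition. The same shape can be expressed directly with the JDK; a sketch over an illustrative record type (not part of the patch; uses a Java 16+ record for brevity):

  import java.util.Comparator;

  final class LexicographicSketch {
    record Key(int a, String b) {}

    static Comparator<Key> build(boolean aDescending) {
      Comparator<Key> byA = Comparator.comparingInt(Key::a);
      if (aDescending) {
        byA = byA.reversed(); // per-column direction, as in addColumnComparator
      }
      // Ties on column a fall through to column b, as in the chained comparators.
      return byA.thenComparing(Key::b);
    }
  }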
diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/wrapper/VectorHashKeyWrapperGeneralComparator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/wrapper/VectorHashKeyWrapperGeneralComparator.java
new file mode 100644
index 0000000000..063534811e
--- /dev/null
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/wrapper/VectorHashKeyWrapperGeneralComparator.java
@@ -0,0 +1,136 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.exec.vector.wrapper;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.List;
+
+import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.expressions.StringExpr;
+import org.apache.hadoop.hive.ql.util.NullOrdering;
+
+/**
+ * An implementation of {@link Comparator} to compare {@link VectorHashKeyWrapperBase} instances.
+ */
+public class VectorHashKeyWrapperGeneralComparator
+    implements Comparator<VectorHashKeyWrapperBase>, Serializable {
+
+  /**
+   * Compares {@link VectorHashKeyWrapperBase} instances by one column only.
+   */
+  private static class VectorHashKeyWrapperBaseComparator
+      implements Comparator<VectorHashKeyWrapperBase>, Serializable {
+
+    private final int keyIndex;
+    private final Comparator<VectorHashKeyWrapperBase> comparator;
+    private final int nullResult;
+
+    VectorHashKeyWrapperBaseComparator(int keyIndex, Comparator<VectorHashKeyWrapperBase> comparator, char nullOrder) {
+      this.keyIndex = keyIndex;
+      this.comparator = comparator;
+      switch (NullOrdering.fromSign(nullOrder)) {
+        case NULLS_FIRST:
+          this.nullResult = 1;
+          break;
+        default:
+          this.nullResult = -1;
+      }
+    }
+
+    @Override
+    public int compare(VectorHashKeyWrapperBase o1, VectorHashKeyWrapperBase o2) {
+      boolean isNull1 = o1.isNull(keyIndex);
+      boolean isNull2 = o2.isNull(keyIndex);
+
+      if (isNull1 && isNull2) {
+        return 0;
+      }
+      if (isNull1) {
+        return -nullResult;
+      }
+      if (isNull2) {
+        return nullResult;
+      }
+      return comparator.compare(o1, o2);
+    }
+  }
+
+  private final List<Comparator<VectorHashKeyWrapperBase>> comparators;
+
+  public VectorHashKeyWrapperGeneralComparator(int numberOfColumns) {
+    this.comparators = new ArrayList<>(numberOfColumns);
+  }
+
+  public void addColumnComparator(int keyIndex, int columnTypeSpecificIndex, ColumnVector.Type columnVectorType,
+      char sortOrder, char nullOrder) {
+    Comparator<VectorHashKeyWrapperBase> comparator;
+    switch (columnVectorType) {
+      case LONG:
+      case DECIMAL_64:
+        comparator = (o1, o2) ->
+            Long.compare(o1.getLongValue(columnTypeSpecificIndex), o2.getLongValue(columnTypeSpecificIndex));
+        break;
+      case DOUBLE:
+        comparator = (o1, o2) -> Double.compare(
+            o1.getDoubleValue(columnTypeSpecificIndex), o2.getDoubleValue(columnTypeSpecificIndex));
+        break;
+      case BYTES:
+        comparator = (o1, o2) -> StringExpr.compare(
+            o1.getBytes(columnTypeSpecificIndex),
+            o1.getByteStart(columnTypeSpecificIndex),
+            o1.getByteLength(columnTypeSpecificIndex),
+            o2.getBytes(columnTypeSpecificIndex),
+            o2.getByteStart(columnTypeSpecificIndex),
+            o2.getByteLength(columnTypeSpecificIndex));
+        break;
+      case DECIMAL:
+        comparator = (o1, o2) ->
+            o1.getDecimal(columnTypeSpecificIndex).compareTo(o2.getDecimal(columnTypeSpecificIndex));
+        break;
+      case TIMESTAMP:
+        comparator = (o1, o2) ->
+            o1.getTimestamp(columnTypeSpecificIndex).compareTo(o2.getTimestamp(columnTypeSpecificIndex));
+        break;
+      case INTERVAL_DAY_TIME:
+        comparator = (o1, o2) -> o1.getIntervalDayTime(columnTypeSpecificIndex)
+            .compareTo(o2.getIntervalDayTime(columnTypeSpecificIndex));
+        break;
+      default:
+        throw new RuntimeException("Unexpected column vector type " + columnVectorType);
+    }
+
+    comparators.add(
+        new VectorHashKeyWrapperBaseComparator(
+            keyIndex,
+            sortOrder == '-' ? comparator.reversed() : comparator,
+            nullOrder));
+  }
+
+  @Override
+  public int compare(VectorHashKeyWrapperBase o1, VectorHashKeyWrapperBase o2) {
+    for (Comparator<VectorHashKeyWrapperBase> comparator : comparators) {
+      int c = comparator.compare(o1, o2);
+      if (c != 0) {
+        return c;
+      }
+    }
+    return 0;
+  }
+}
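Note: the inner VectorHashKeyWrapperBaseComparator settles null placement before delegating to the value comparator: under NULLS_FIRST a null on the left compares as smaller regardless of sort direction (the '-' reversal is applied to the value comparator only), and two non-null values fall through to the column comparator. The JDK's Comparator.nullsFirst/nullsLast implement the same contract; a sketch over boxed values (illustrative, not part of the patch):

  import java.util.Comparator;

  final class NullOrderSketch {
    // null vs null -> 0; null vs value -> decided by placement; else delegate.
    static Comparator<Integer> of(boolean nullsFirst, Comparator<Integer> values) {
      return nullsFirst ? Comparator.nullsFirst(values) : Comparator.nullsLast(values);
    }
  }

For example, of(true, Comparator.naturalOrder()).compare(null, 5) is negative, matching NULLS FIRST, while of(false, Comparator.naturalOrder()).compare(null, 5) is positive, matching NULLS LAST.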
diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
index 4cc02b4975..bb5f9dfc9e 100644
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
@@ -4334,7 +4334,7 @@ private boolean usesVectorUDFAdaptor(VectorExpression[] vecExprs) {
     TopNKeyDesc topNKeyDesc = (TopNKeyDesc) topNKeyOperator.getConf();
     List<ExprNodeDesc> keyColumns = topNKeyDesc.getKeyColumns();
-    VectorExpression[] keyExpressions = vContext.getVectorExpressions(keyColumns);
+    VectorExpression[] keyExpressions = vContext.getVectorExpressionsUpConvertDecimal64(keyColumns);
     vectorTopNKeyDesc.setKeyExpressions(keyExpressions);
     return OperatorFactory.getVectorOperator(
         topNKeyOperator.getCompilationOpContext(), topNKeyDesc,
diff --git ql/src/test/queries/clientpositive/topnkey.q ql/src/test/queries/clientpositive/topnkey.q
index 283f426f18..057b6a45ba 100644
--- ql/src/test/queries/clientpositive/topnkey.q
+++ ql/src/test/queries/clientpositive/topnkey.q
@@ -28,57 +28,3 @@
 explain vectorization detail
 SELECT src1.key, src2.value FROM src src1 JOIN src src2 ON (src1.key = src2.key) ORDER BY src1.key LIMIT 5;
 SELECT src1.key, src2.value FROM src src1 JOIN src src2 ON (src1.key = src2.key) ORDER BY src1.key LIMIT 5;
-
-CREATE TABLE t_test(
-  a int,
-  b int,
-  c int
-);
-
-INSERT INTO t_test VALUES
-(NULL, NULL, NULL),
-(5, 2, 3),
-(NULL, NULL, NULL),
-(NULL, NULL, NULL),
-(6, 2, 1),
-(7, 8, 4), (7, 8, 4), (7, 8, 4),
-(5, 1, 2), (5, 1, 2), (5, 1, 2),
-(NULL, NULL, NULL);
-
-EXPLAIN
-SELECT a, b FROM t_test GROUP BY a, b ORDER BY a, b LIMIT 3;
-SELECT a, b FROM t_test GROUP BY a, b ORDER BY a, b LIMIT 3;
-
-
-EXPLAIN
-SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a LIMIT 2;
-SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a LIMIT 2;
-SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a NULLS FIRST LIMIT 2;
-SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a NULLS LAST LIMIT 2;
-SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a ASC LIMIT 2;
-SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a ASC NULLS FIRST LIMIT 2;
-SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a ASC NULLS LAST LIMIT 2;
-
-DROP TABLE IF EXISTS t_test;
-
-CREATE TABLE t_test(
-  a int,
-  b int,
-  c int
-);
-
-INSERT INTO t_test VALUES
-(7, 8, 4), (7, 8, 4), (7, 8, 4),
-(NULL, NULL, NULL),
-(5, 2, 3),
-(NULL, NULL, NULL),
-(NULL, NULL, NULL),
-(6, 2, 1),
-(5, 1, 2), (5, 1, 2), (5, 1, 2),
-(NULL, NULL, NULL);
-
-SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a DESC LIMIT 2;
-SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a DESC NULLS FIRST LIMIT 2;
-SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a DESC NULLS LAST LIMIT 2;
-
-DROP TABLE IF EXISTS t_test;
diff --git ql/src/test/queries/clientpositive/topnkey_order_null.q ql/src/test/queries/clientpositive/topnkey_order_null.q
new file mode 100644
index 0000000000..8d04104e76
--- /dev/null
+++ ql/src/test/queries/clientpositive/topnkey_order_null.q
@@ -0,0 +1,83 @@
+SET hive.vectorized.execution.enabled=false;
+SET hive.optimize.topnkey=true;
+
+CREATE TABLE t_test(
+ 
a int, + b int, + c int +); + +INSERT INTO t_test VALUES +(NULL, NULL, NULL), +(5, 2, 3), +(NULL, NULL, NULL), +(NULL, NULL, NULL), +(6, 2, 1), +(7, 8, 4), (7, 8, 4), (7, 8, 4), +(5, 1, 2), (5, 1, 2), (5, 1, 2), +(NULL, NULL, NULL); + +SET hive.vectorized.execution.enabled=false; +SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a LIMIT 2; +SET hive.vectorized.execution.enabled=true; +SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a LIMIT 2; + +SET hive.vectorized.execution.enabled=false; +SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a NULLS FIRST LIMIT 2; +SET hive.vectorized.execution.enabled=true; +SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a NULLS FIRST LIMIT 2; + +SET hive.vectorized.execution.enabled=false; +SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a NULLS LAST LIMIT 2; +SET hive.vectorized.execution.enabled=true; +SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a NULLS LAST LIMIT 2; + +SET hive.vectorized.execution.enabled=false; +SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a ASC LIMIT 2; +SET hive.vectorized.execution.enabled=true; +SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a ASC LIMIT 2; + +SET hive.vectorized.execution.enabled=false; +SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a ASC NULLS FIRST LIMIT 2; +SET hive.vectorized.execution.enabled=true; +SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a ASC NULLS FIRST LIMIT 2; + +SET hive.vectorized.execution.enabled=false; +SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a ASC NULLS LAST LIMIT 2; +SET hive.vectorized.execution.enabled=true; +SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a ASC NULLS LAST LIMIT 2; + +DROP TABLE IF EXISTS t_test; + +CREATE TABLE t_test( + a int, + b int, + c int +); + +INSERT INTO t_test VALUES +(7, 8, 4), (7, 8, 4), (7, 8, 4), +(NULL, NULL, NULL), +(5, 2, 3), +(NULL, NULL, NULL), +(NULL, NULL, NULL), +(6, 2, 1), +(5, 1, 2), (5, 1, 2), (5, 1, 2), +(NULL, NULL, NULL); + +SET hive.vectorized.execution.enabled=false; +SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a DESC LIMIT 2; +SET hive.vectorized.execution.enabled=true; +SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a DESC LIMIT 2; + +SET hive.vectorized.execution.enabled=false; +SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a DESC NULLS FIRST LIMIT 2; +SET hive.vectorized.execution.enabled=true; +SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a DESC NULLS FIRST LIMIT 2; + +SET hive.vectorized.execution.enabled=false; +SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a DESC NULLS LAST LIMIT 2; +SET hive.vectorized.execution.enabled=true; +SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a DESC NULLS LAST LIMIT 2; + +DROP TABLE IF EXISTS t_test; diff --git ql/src/test/queries/clientpositive/vector_topnkey.q ql/src/test/queries/clientpositive/vector_topnkey.q index e1b7d26afe..85c5880cd6 100644 --- ql/src/test/queries/clientpositive/vector_topnkey.q +++ ql/src/test/queries/clientpositive/vector_topnkey.q @@ -1,4 +1,3 @@ ---! 
qt:dataset:src set hive.mapred.mode=nonstrict; set hive.vectorized.execution.enabled=true; set hive.optimize.topnkey=true; @@ -14,17 +13,34 @@ set hive.tez.dynamic.partition.pruning=true; set hive.stats.fetch.column.stats=true; set hive.cbo.enable=true; -explain vectorization detail -SELECT key, SUM(CAST(SUBSTR(value,5) AS INT)) FROM src GROUP BY key ORDER BY key LIMIT 5; - -SELECT key, SUM(CAST(SUBSTR(value,5) AS INT)) FROM src GROUP BY key ORDER BY key LIMIT 5; - -explain vectorization detail -SELECT key FROM src GROUP BY key ORDER BY key LIMIT 5; - -SELECT key FROM src GROUP BY key ORDER BY key LIMIT 5; - -explain vectorization detail -SELECT src1.key, src2.value FROM src src1 JOIN src src2 ON (src1.key = src2.key) ORDER BY src1.key LIMIT 5; - -SELECT src1.key, src2.value FROM src src1 JOIN src src2 ON (src1.key = src2.key) ORDER BY src1.key LIMIT 5; +CREATE TABLE t_test( + cint1 int, + cint2 int, + cdouble double, + cvarchar varchar(50), + cdecimal1 decimal(10,2), + cdecimal2 decimal(38,5) +); + +INSERT INTO t_test VALUES +(NULL, NULL, NULL, NULL, NULL, NULL), +(8, 9, 2.0, 'one', 2.0, 2.0), (8, 9, 2.0, 'one', 2.0, 2.0), +(4, 2, 3.3, 'two', 3.3, 3.3), +(NULL, NULL, NULL, NULL, NULL, NULL), +(NULL, NULL, NULL, NULL, NULL, NULL), +(6, 2, 1.8, 'three', 1.8, 1.8), +(7, 8, 4.5, 'four', 4.5, 4.5), (7, 8, 4.5, 'four', 4.5, 4.5), (7, 8, 4.5, 'four', 4.5, 4.5), +(4, 1, 2.0, 'five', 2.0, 2.0), (4, 1, 2.0, 'five', 2.0, 2.0), (4, 1, 2.0, 'five', 2.0, 2.0), +(NULL, NULL, NULL, NULL, NULL, NULL); + +EXPLAIN VECTORIZATION DETAIL +SELECT cint1 FROM t_test GROUP BY cint1 ORDER BY cint1 LIMIT 3; + +SELECT cint1 FROM t_test GROUP BY cint1 ORDER BY cint1 LIMIT 3; +SELECT cint1, cint2 FROM t_test GROUP BY cint1, cint2 ORDER BY cint1, cint2 LIMIT 3; +SELECT cint1, cint2 FROM t_test GROUP BY cint1, cint2 ORDER BY cint1 DESC, cint2 LIMIT 3; +SELECT cint1, cdouble FROM t_test GROUP BY cint1, cdouble ORDER BY cint1, cdouble LIMIT 3; +SELECT cvarchar, cdouble FROM t_test GROUP BY cvarchar, cdouble ORDER BY cvarchar, cdouble LIMIT 3; +SELECT cdecimal1, cdecimal2 FROM t_test GROUP BY cdecimal1, cdecimal2 ORDER BY cdecimal1, cdecimal2 LIMIT 3; + +DROP TABLE t_test; \ No newline at end of file diff --git ql/src/test/results/clientpositive/llap/topnkey.q.out ql/src/test/results/clientpositive/llap/topnkey.q.out index cd47e9d223..30b08ee1d1 100644 --- ql/src/test/results/clientpositive/llap/topnkey.q.out +++ ql/src/test/results/clientpositive/llap/topnkey.q.out @@ -415,386 +415,3 @@ POSTHOOK: Input: default@src 0 val_0 0 val_0 0 val_0 -PREHOOK: query: CREATE TABLE t_test( - a int, - b int, - c int -) -PREHOOK: type: CREATETABLE -PREHOOK: Output: database:default -PREHOOK: Output: default@t_test -POSTHOOK: query: CREATE TABLE t_test( - a int, - b int, - c int -) -POSTHOOK: type: CREATETABLE -POSTHOOK: Output: database:default -POSTHOOK: Output: default@t_test -PREHOOK: query: INSERT INTO t_test VALUES -(NULL, NULL, NULL), -(5, 2, 3), -(NULL, NULL, NULL), -(NULL, NULL, NULL), -(6, 2, 1), -(7, 8, 4), (7, 8, 4), (7, 8, 4), -(5, 1, 2), (5, 1, 2), (5, 1, 2), -(NULL, NULL, NULL) -PREHOOK: type: QUERY -PREHOOK: Input: _dummy_database@_dummy_table -PREHOOK: Output: default@t_test -POSTHOOK: query: INSERT INTO t_test VALUES -(NULL, NULL, NULL), -(5, 2, 3), -(NULL, NULL, NULL), -(NULL, NULL, NULL), -(6, 2, 1), -(7, 8, 4), (7, 8, 4), (7, 8, 4), -(5, 1, 2), (5, 1, 2), (5, 1, 2), -(NULL, NULL, NULL) -POSTHOOK: type: QUERY -POSTHOOK: Input: _dummy_database@_dummy_table -POSTHOOK: Output: default@t_test -POSTHOOK: Lineage: t_test.a 
SCRIPT [] -POSTHOOK: Lineage: t_test.b SCRIPT [] -POSTHOOK: Lineage: t_test.c SCRIPT [] -PREHOOK: query: EXPLAIN -SELECT a, b FROM t_test GROUP BY a, b ORDER BY a, b LIMIT 3 -PREHOOK: type: QUERY -PREHOOK: Input: default@t_test -#### A masked pattern was here #### -POSTHOOK: query: EXPLAIN -SELECT a, b FROM t_test GROUP BY a, b ORDER BY a, b LIMIT 3 -POSTHOOK: type: QUERY -POSTHOOK: Input: default@t_test -#### A masked pattern was here #### -STAGE DEPENDENCIES: - Stage-1 is a root stage - Stage-0 depends on stages: Stage-1 - -STAGE PLANS: - Stage: Stage-1 - Tez -#### A masked pattern was here #### - Edges: - Reducer 2 <- Map 1 (SIMPLE_EDGE) - Reducer 3 <- Reducer 2 (SIMPLE_EDGE) -#### A masked pattern was here #### - Vertices: - Map 1 - Map Operator Tree: - TableScan - alias: t_test - Statistics: Num rows: 12 Data size: 72 Basic stats: COMPLETE Column stats: COMPLETE - Select Operator - expressions: a (type: int), b (type: int) - outputColumnNames: a, b - Statistics: Num rows: 12 Data size: 72 Basic stats: COMPLETE Column stats: COMPLETE - Top N Key Operator - sort order: ++ - keys: a (type: int), b (type: int) - Statistics: Num rows: 12 Data size: 72 Basic stats: COMPLETE Column stats: COMPLETE - top n: 3 - Group By Operator - keys: a (type: int), b (type: int) - minReductionHashAggr: 0.3333333 - mode: hash - outputColumnNames: _col0, _col1 - Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE - Reduce Output Operator - key expressions: _col0 (type: int), _col1 (type: int) - sort order: ++ - Map-reduce partition columns: _col0 (type: int), _col1 (type: int) - Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE - TopN Hash Memory Usage: 0.1 - Execution mode: llap - LLAP IO: no inputs - Reducer 2 - Execution mode: llap - Reduce Operator Tree: - Group By Operator - keys: KEY._col0 (type: int), KEY._col1 (type: int) - mode: mergepartial - outputColumnNames: _col0, _col1 - Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE - Reduce Output Operator - key expressions: _col0 (type: int), _col1 (type: int) - sort order: ++ - Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE - TopN Hash Memory Usage: 0.1 - Reducer 3 - Execution mode: llap - Reduce Operator Tree: - Select Operator - expressions: KEY.reducesinkkey0 (type: int), KEY.reducesinkkey1 (type: int) - outputColumnNames: _col0, _col1 - Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE - Limit - Number of rows: 3 - Statistics: Num rows: 3 Data size: 16 Basic stats: COMPLETE Column stats: COMPLETE - File Output Operator - compressed: false - Statistics: Num rows: 3 Data size: 16 Basic stats: COMPLETE Column stats: COMPLETE - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - - Stage: Stage-0 - Fetch Operator - limit: 3 - Processor Tree: - ListSink - -PREHOOK: query: SELECT a, b FROM t_test GROUP BY a, b ORDER BY a, b LIMIT 3 -PREHOOK: type: QUERY -PREHOOK: Input: default@t_test -#### A masked pattern was here #### -POSTHOOK: query: SELECT a, b FROM t_test GROUP BY a, b ORDER BY a, b LIMIT 3 -POSTHOOK: type: QUERY -POSTHOOK: Input: default@t_test -#### A masked pattern was here #### -5 1 -5 2 -6 2 -PREHOOK: query: EXPLAIN -SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a LIMIT 2 -PREHOOK: type: QUERY -PREHOOK: Input: 
default@t_test -#### A masked pattern was here #### -POSTHOOK: query: EXPLAIN -SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a LIMIT 2 -POSTHOOK: type: QUERY -POSTHOOK: Input: default@t_test -#### A masked pattern was here #### -STAGE DEPENDENCIES: - Stage-1 is a root stage - Stage-0 depends on stages: Stage-1 - -STAGE PLANS: - Stage: Stage-1 - Tez -#### A masked pattern was here #### - Edges: - Reducer 2 <- Map 1 (SIMPLE_EDGE) - Reducer 3 <- Reducer 2 (SIMPLE_EDGE) -#### A masked pattern was here #### - Vertices: - Map 1 - Map Operator Tree: - TableScan - alias: t_test - Statistics: Num rows: 12 Data size: 72 Basic stats: COMPLETE Column stats: COMPLETE - Select Operator - expressions: a (type: int), b (type: int) - outputColumnNames: a, b - Statistics: Num rows: 12 Data size: 72 Basic stats: COMPLETE Column stats: COMPLETE - Top N Key Operator - sort order: + - keys: a (type: int) - Statistics: Num rows: 12 Data size: 72 Basic stats: COMPLETE Column stats: COMPLETE - top n: 2 - Group By Operator - aggregations: count(b) - keys: a (type: int) - minReductionHashAggr: 0.6666666 - mode: hash - outputColumnNames: _col0, _col1 - Statistics: Num rows: 4 Data size: 40 Basic stats: COMPLETE Column stats: COMPLETE - Reduce Output Operator - key expressions: _col0 (type: int) - sort order: + - Map-reduce partition columns: _col0 (type: int) - Statistics: Num rows: 4 Data size: 40 Basic stats: COMPLETE Column stats: COMPLETE - TopN Hash Memory Usage: 0.1 - value expressions: _col1 (type: bigint) - Execution mode: llap - LLAP IO: no inputs - Reducer 2 - Execution mode: llap - Reduce Operator Tree: - Group By Operator - aggregations: count(VALUE._col0) - keys: KEY._col0 (type: int) - mode: mergepartial - outputColumnNames: _col0, _col1 - Statistics: Num rows: 4 Data size: 40 Basic stats: COMPLETE Column stats: COMPLETE - Reduce Output Operator - key expressions: _col0 (type: int) - sort order: + - Statistics: Num rows: 4 Data size: 40 Basic stats: COMPLETE Column stats: COMPLETE - TopN Hash Memory Usage: 0.1 - value expressions: _col1 (type: bigint) - Reducer 3 - Execution mode: llap - Reduce Operator Tree: - Select Operator - expressions: KEY.reducesinkkey0 (type: int), VALUE._col0 (type: bigint) - outputColumnNames: _col0, _col1 - Statistics: Num rows: 4 Data size: 40 Basic stats: COMPLETE Column stats: COMPLETE - Limit - Number of rows: 2 - Statistics: Num rows: 2 Data size: 20 Basic stats: COMPLETE Column stats: COMPLETE - File Output Operator - compressed: false - Statistics: Num rows: 2 Data size: 20 Basic stats: COMPLETE Column stats: COMPLETE - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - - Stage: Stage-0 - Fetch Operator - limit: 2 - Processor Tree: - ListSink - -PREHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a LIMIT 2 -PREHOOK: type: QUERY -PREHOOK: Input: default@t_test -#### A masked pattern was here #### -POSTHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a LIMIT 2 -POSTHOOK: type: QUERY -POSTHOOK: Input: default@t_test -#### A masked pattern was here #### -5 4 -6 1 -PREHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a NULLS FIRST LIMIT 2 -PREHOOK: type: QUERY -PREHOOK: Input: default@t_test -#### A masked pattern was here #### -POSTHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a NULLS FIRST LIMIT 2 -POSTHOOK: type: QUERY -POSTHOOK: Input: 
default@t_test -#### A masked pattern was here #### -NULL 0 -5 4 -PREHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a NULLS LAST LIMIT 2 -PREHOOK: type: QUERY -PREHOOK: Input: default@t_test -#### A masked pattern was here #### -POSTHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a NULLS LAST LIMIT 2 -POSTHOOK: type: QUERY -POSTHOOK: Input: default@t_test -#### A masked pattern was here #### -5 4 -6 1 -PREHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a ASC LIMIT 2 -PREHOOK: type: QUERY -PREHOOK: Input: default@t_test -#### A masked pattern was here #### -POSTHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a ASC LIMIT 2 -POSTHOOK: type: QUERY -POSTHOOK: Input: default@t_test -#### A masked pattern was here #### -5 4 -6 1 -PREHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a ASC NULLS FIRST LIMIT 2 -PREHOOK: type: QUERY -PREHOOK: Input: default@t_test -#### A masked pattern was here #### -POSTHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a ASC NULLS FIRST LIMIT 2 -POSTHOOK: type: QUERY -POSTHOOK: Input: default@t_test -#### A masked pattern was here #### -NULL 0 -5 4 -PREHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a ASC NULLS LAST LIMIT 2 -PREHOOK: type: QUERY -PREHOOK: Input: default@t_test -#### A masked pattern was here #### -POSTHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a ASC NULLS LAST LIMIT 2 -POSTHOOK: type: QUERY -POSTHOOK: Input: default@t_test -#### A masked pattern was here #### -5 4 -6 1 -PREHOOK: query: DROP TABLE IF EXISTS t_test -PREHOOK: type: DROPTABLE -PREHOOK: Input: default@t_test -PREHOOK: Output: default@t_test -POSTHOOK: query: DROP TABLE IF EXISTS t_test -POSTHOOK: type: DROPTABLE -POSTHOOK: Input: default@t_test -POSTHOOK: Output: default@t_test -PREHOOK: query: CREATE TABLE t_test( - a int, - b int, - c int -) -PREHOOK: type: CREATETABLE -PREHOOK: Output: database:default -PREHOOK: Output: default@t_test -POSTHOOK: query: CREATE TABLE t_test( - a int, - b int, - c int -) -POSTHOOK: type: CREATETABLE -POSTHOOK: Output: database:default -POSTHOOK: Output: default@t_test -PREHOOK: query: INSERT INTO t_test VALUES -(7, 8, 4), (7, 8, 4), (7, 8, 4), -(NULL, NULL, NULL), -(5, 2, 3), -(NULL, NULL, NULL), -(NULL, NULL, NULL), -(6, 2, 1), -(5, 1, 2), (5, 1, 2), (5, 1, 2), -(NULL, NULL, NULL) -PREHOOK: type: QUERY -PREHOOK: Input: _dummy_database@_dummy_table -PREHOOK: Output: default@t_test -POSTHOOK: query: INSERT INTO t_test VALUES -(7, 8, 4), (7, 8, 4), (7, 8, 4), -(NULL, NULL, NULL), -(5, 2, 3), -(NULL, NULL, NULL), -(NULL, NULL, NULL), -(6, 2, 1), -(5, 1, 2), (5, 1, 2), (5, 1, 2), -(NULL, NULL, NULL) -POSTHOOK: type: QUERY -POSTHOOK: Input: _dummy_database@_dummy_table -POSTHOOK: Output: default@t_test -POSTHOOK: Lineage: t_test.a SCRIPT [] -POSTHOOK: Lineage: t_test.b SCRIPT [] -POSTHOOK: Lineage: t_test.c SCRIPT [] -PREHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a DESC LIMIT 2 -PREHOOK: type: QUERY -PREHOOK: Input: default@t_test -#### A masked pattern was here #### -POSTHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a DESC LIMIT 2 -POSTHOOK: type: QUERY -POSTHOOK: Input: default@t_test -#### A masked pattern was here #### -7 3 -6 1 -PREHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a DESC NULLS FIRST LIMIT 2 -PREHOOK: type: QUERY -PREHOOK: Input: default@t_test -#### A masked pattern was here #### -POSTHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a DESC 
NULLS FIRST LIMIT 2 -POSTHOOK: type: QUERY -POSTHOOK: Input: default@t_test -#### A masked pattern was here #### -NULL 0 -7 3 -PREHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a DESC NULLS LAST LIMIT 2 -PREHOOK: type: QUERY -PREHOOK: Input: default@t_test -#### A masked pattern was here #### -POSTHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a DESC NULLS LAST LIMIT 2 -POSTHOOK: type: QUERY -POSTHOOK: Input: default@t_test -#### A masked pattern was here #### -7 3 -6 1 -PREHOOK: query: DROP TABLE IF EXISTS t_test -PREHOOK: type: DROPTABLE -PREHOOK: Input: default@t_test -PREHOOK: Output: default@t_test -POSTHOOK: query: DROP TABLE IF EXISTS t_test -POSTHOOK: type: DROPTABLE -POSTHOOK: Input: default@t_test -POSTHOOK: Output: default@t_test diff --git ql/src/test/results/clientpositive/llap/vector_topnkey.q.out ql/src/test/results/clientpositive/llap/vector_topnkey.q.out index d859270ff0..df4eba1587 100644 --- ql/src/test/results/clientpositive/llap/vector_topnkey.q.out +++ ql/src/test/results/clientpositive/llap/vector_topnkey.q.out @@ -1,212 +1,66 @@ -PREHOOK: query: explain vectorization detail -SELECT key, SUM(CAST(SUBSTR(value,5) AS INT)) FROM src GROUP BY key ORDER BY key LIMIT 5 +PREHOOK: query: CREATE TABLE t_test( + cint1 int, + cint2 int, + cdouble double, + cvarchar varchar(50), + cdecimal1 decimal(10,2), + cdecimal2 decimal(38,5) +) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@t_test +POSTHOOK: query: CREATE TABLE t_test( + cint1 int, + cint2 int, + cdouble double, + cvarchar varchar(50), + cdecimal1 decimal(10,2), + cdecimal2 decimal(38,5) +) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@t_test +PREHOOK: query: INSERT INTO t_test VALUES +(NULL, NULL, NULL, NULL, NULL, NULL), +(8, 9, 2.0, 'one', 2.0, 2.0), (8, 9, 2.0, 'one', 2.0, 2.0), +(4, 2, 3.3, 'two', 3.3, 3.3), +(NULL, NULL, NULL, NULL, NULL, NULL), +(NULL, NULL, NULL, NULL, NULL, NULL), +(6, 2, 1.8, 'three', 1.8, 1.8), +(7, 8, 4.5, 'four', 4.5, 4.5), (7, 8, 4.5, 'four', 4.5, 4.5), (7, 8, 4.5, 'four', 4.5, 4.5), +(4, 1, 2.0, 'five', 2.0, 2.0), (4, 1, 2.0, 'five', 2.0, 2.0), (4, 1, 2.0, 'five', 2.0, 2.0), +(NULL, NULL, NULL, NULL, NULL, NULL) PREHOOK: type: QUERY -PREHOOK: Input: default@src -#### A masked pattern was here #### -POSTHOOK: query: explain vectorization detail -SELECT key, SUM(CAST(SUBSTR(value,5) AS INT)) FROM src GROUP BY key ORDER BY key LIMIT 5 -POSTHOOK: type: QUERY -POSTHOOK: Input: default@src -#### A masked pattern was here #### -PLAN VECTORIZATION: - enabled: true - enabledConditionsMet: [hive.vectorized.execution.enabled IS true] - -STAGE DEPENDENCIES: - Stage-1 is a root stage - Stage-0 depends on stages: Stage-1 - -STAGE PLANS: - Stage: Stage-1 - Tez -#### A masked pattern was here #### - Edges: - Reducer 2 <- Map 1 (SIMPLE_EDGE) - Reducer 3 <- Reducer 2 (SIMPLE_EDGE) -#### A masked pattern was here #### - Vertices: - Map 1 - Map Operator Tree: - TableScan - alias: src - Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE - TableScan Vectorization: - native: true - vectorizationSchemaColumns: [0:key:string, 1:value:string, 2:ROW__ID:struct] - Select Operator - expressions: key (type: string), UDFToInteger(substr(value, 5)) (type: int) - outputColumnNames: _col0, _col1 - Select Vectorization: - className: VectorSelectOperator - native: true - projectedOutputColumnNums: [0, 4] - selectExpressions: CastStringToLong(col 3:string)(children: 
StringSubstrColStart(col 1:string, start 4) -> 3:string) -> 4:int - Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE - Top N Key Operator - sort order: + - keys: _col0 (type: string) - Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE - top n: 5 - Top N Key Vectorization: - className: VectorTopNKeyOperator - keyExpressions: col 0:string - native: true - Group By Operator - aggregations: sum(_col1) - Group By Vectorization: - aggregators: VectorUDAFSumLong(col 4:int) -> bigint - className: VectorGroupByOperator - groupByMode: HASH - keyExpressions: col 0:string - native: false - vectorProcessingMode: HASH - projectedOutputColumnNums: [0] - keys: _col0 (type: string) - minReductionHashAggr: 0.5 - mode: hash - outputColumnNames: _col0, _col1 - Statistics: Num rows: 250 Data size: 23750 Basic stats: COMPLETE Column stats: COMPLETE - Reduce Output Operator - key expressions: _col0 (type: string) - sort order: + - Map-reduce partition columns: _col0 (type: string) - Reduce Sink Vectorization: - className: VectorReduceSinkStringOperator - keyColumns: 0:string - native: true - nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine tez IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true - valueColumns: 1:bigint - Statistics: Num rows: 250 Data size: 23750 Basic stats: COMPLETE Column stats: COMPLETE - TopN Hash Memory Usage: 0.1 - value expressions: _col1 (type: bigint) - Execution mode: vectorized, llap - LLAP IO: no inputs - Map Vectorization: - enabled: true - enabledConditionsMet: hive.vectorized.use.vector.serde.deserialize IS true - inputFormatFeatureSupport: [DECIMAL_64] - featureSupportInUse: [DECIMAL_64] - inputFileFormats: org.apache.hadoop.mapred.TextInputFormat - allNative: false - usesVectorUDFAdaptor: false - vectorized: true - rowBatchContext: - dataColumnCount: 2 - includeColumns: [0, 1] - dataColumns: key:string, value:string - partitionColumnCount: 0 - scratchColumnTypeNames: [string, bigint] - Reducer 2 - Execution mode: vectorized, llap - Reduce Vectorization: - enabled: true - enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true, hive.execution.engine tez IN [tez, spark] IS true - reduceColumnNullOrder: z - reduceColumnSortOrder: + - allNative: false - usesVectorUDFAdaptor: false - vectorized: true - rowBatchContext: - dataColumnCount: 2 - dataColumns: KEY._col0:string, VALUE._col0:bigint - partitionColumnCount: 0 - scratchColumnTypeNames: [] - Reduce Operator Tree: - Group By Operator - aggregations: sum(VALUE._col0) - Group By Vectorization: - aggregators: VectorUDAFSumLong(col 1:bigint) -> bigint - className: VectorGroupByOperator - groupByMode: MERGEPARTIAL - keyExpressions: col 0:string - native: false - vectorProcessingMode: MERGE_PARTIAL - projectedOutputColumnNums: [0] - keys: KEY._col0 (type: string) - mode: mergepartial - outputColumnNames: _col0, _col1 - Statistics: Num rows: 250 Data size: 23750 Basic stats: COMPLETE Column stats: COMPLETE - Reduce Output Operator - key expressions: _col0 (type: string) - sort order: + - Reduce Sink Vectorization: - className: VectorReduceSinkObjectHashOperator - keyColumns: 0:string - native: true - nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine tez IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for 
keys IS true, LazyBinarySerDe for values IS true - valueColumns: 1:bigint - Statistics: Num rows: 250 Data size: 23750 Basic stats: COMPLETE Column stats: COMPLETE - TopN Hash Memory Usage: 0.1 - value expressions: _col1 (type: bigint) - Reducer 3 - Execution mode: vectorized, llap - Reduce Vectorization: - enabled: true - enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true, hive.execution.engine tez IN [tez, spark] IS true - reduceColumnNullOrder: z - reduceColumnSortOrder: + - allNative: false - usesVectorUDFAdaptor: false - vectorized: true - rowBatchContext: - dataColumnCount: 2 - dataColumns: KEY.reducesinkkey0:string, VALUE._col0:bigint - partitionColumnCount: 0 - scratchColumnTypeNames: [] - Reduce Operator Tree: - Select Operator - expressions: KEY.reducesinkkey0 (type: string), VALUE._col0 (type: bigint) - outputColumnNames: _col0, _col1 - Select Vectorization: - className: VectorSelectOperator - native: true - projectedOutputColumnNums: [0, 1] - Statistics: Num rows: 250 Data size: 23750 Basic stats: COMPLETE Column stats: COMPLETE - Limit - Number of rows: 5 - Limit Vectorization: - className: VectorLimitOperator - native: true - Statistics: Num rows: 5 Data size: 475 Basic stats: COMPLETE Column stats: COMPLETE - File Output Operator - compressed: false - File Sink Vectorization: - className: VectorFileSinkOperator - native: false - Statistics: Num rows: 5 Data size: 475 Basic stats: COMPLETE Column stats: COMPLETE - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - - Stage: Stage-0 - Fetch Operator - limit: 5 - Processor Tree: - ListSink - -PREHOOK: query: SELECT key, SUM(CAST(SUBSTR(value,5) AS INT)) FROM src GROUP BY key ORDER BY key LIMIT 5 -PREHOOK: type: QUERY -PREHOOK: Input: default@src -#### A masked pattern was here #### -POSTHOOK: query: SELECT key, SUM(CAST(SUBSTR(value,5) AS INT)) FROM src GROUP BY key ORDER BY key LIMIT 5 +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@t_test +POSTHOOK: query: INSERT INTO t_test VALUES +(NULL, NULL, NULL, NULL, NULL, NULL), +(8, 9, 2.0, 'one', 2.0, 2.0), (8, 9, 2.0, 'one', 2.0, 2.0), +(4, 2, 3.3, 'two', 3.3, 3.3), +(NULL, NULL, NULL, NULL, NULL, NULL), +(NULL, NULL, NULL, NULL, NULL, NULL), +(6, 2, 1.8, 'three', 1.8, 1.8), +(7, 8, 4.5, 'four', 4.5, 4.5), (7, 8, 4.5, 'four', 4.5, 4.5), (7, 8, 4.5, 'four', 4.5, 4.5), +(4, 1, 2.0, 'five', 2.0, 2.0), (4, 1, 2.0, 'five', 2.0, 2.0), (4, 1, 2.0, 'five', 2.0, 2.0), +(NULL, NULL, NULL, NULL, NULL, NULL) POSTHOOK: type: QUERY -POSTHOOK: Input: default@src -#### A masked pattern was here #### -0 0 -10 10 -100 200 -103 206 -104 208 -PREHOOK: query: explain vectorization detail -SELECT key FROM src GROUP BY key ORDER BY key LIMIT 5 +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@t_test +POSTHOOK: Lineage: t_test.cdecimal1 SCRIPT [] +POSTHOOK: Lineage: t_test.cdecimal2 SCRIPT [] +POSTHOOK: Lineage: t_test.cdouble SCRIPT [] +POSTHOOK: Lineage: t_test.cint1 SCRIPT [] +POSTHOOK: Lineage: t_test.cint2 SCRIPT [] +POSTHOOK: Lineage: t_test.cvarchar SCRIPT [] +PREHOOK: query: EXPLAIN VECTORIZATION DETAIL +SELECT cint1 FROM t_test GROUP BY cint1 ORDER BY cint1 LIMIT 3 PREHOOK: type: QUERY -PREHOOK: Input: default@src +PREHOOK: Input: default@t_test #### A masked pattern was here #### -POSTHOOK: query: explain vectorization detail -SELECT key FROM src GROUP BY key ORDER BY key 
LIMIT 5 +POSTHOOK: query: EXPLAIN VECTORIZATION DETAIL +SELECT cint1 FROM t_test GROUP BY cint1 ORDER BY cint1 LIMIT 3 POSTHOOK: type: QUERY -POSTHOOK: Input: default@src +POSTHOOK: Input: default@t_test #### A masked pattern was here #### PLAN VECTORIZATION: enabled: true @@ -228,51 +82,51 @@ STAGE PLANS: Map 1 Map Operator Tree: TableScan - alias: src - Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE + alias: t_test + Statistics: Num rows: 14 Data size: 44 Basic stats: COMPLETE Column stats: COMPLETE TableScan Vectorization: native: true - vectorizationSchemaColumns: [0:key:string, 1:value:string, 2:ROW__ID:struct] + vectorizationSchemaColumns: [0:cint1:int, 1:cint2:int, 2:cdouble:double, 3:cvarchar:varchar(50), 4:cdecimal1:decimal(10,2)/DECIMAL_64, 5:cdecimal2:decimal(38,5), 6:ROW__ID:struct] Select Operator - expressions: key (type: string) - outputColumnNames: key + expressions: cint1 (type: int) + outputColumnNames: cint1 Select Vectorization: className: VectorSelectOperator native: true projectedOutputColumnNums: [0] - Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 14 Data size: 44 Basic stats: COMPLETE Column stats: COMPLETE Top N Key Operator sort order: + - keys: key (type: string) - Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE - top n: 5 + keys: cint1 (type: int) + Statistics: Num rows: 14 Data size: 44 Basic stats: COMPLETE Column stats: COMPLETE + top n: 3 Top N Key Vectorization: className: VectorTopNKeyOperator - keyExpressions: col 0:string + keyExpressions: col 0:int native: true Group By Operator Group By Vectorization: className: VectorGroupByOperator groupByMode: HASH - keyExpressions: col 0:string + keyExpressions: col 0:int native: false vectorProcessingMode: HASH projectedOutputColumnNums: [] - keys: key (type: string) - minReductionHashAggr: 0.5 + keys: cint1 (type: int) + minReductionHashAggr: 0.64285713 mode: hash outputColumnNames: _col0 - Statistics: Num rows: 250 Data size: 21750 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 5 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator - key expressions: _col0 (type: string) + key expressions: _col0 (type: int) sort order: + - Map-reduce partition columns: _col0 (type: string) + Map-reduce partition columns: _col0 (type: int) Reduce Sink Vectorization: - className: VectorReduceSinkStringOperator - keyColumns: 0:string + className: VectorReduceSinkLongOperator + keyColumns: 0:int native: true nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine tez IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true - Statistics: Num rows: 250 Data size: 21750 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 5 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE TopN Hash Memory Usage: 0.1 Execution mode: vectorized, llap LLAP IO: no inputs @@ -286,9 +140,9 @@ STAGE PLANS: usesVectorUDFAdaptor: false vectorized: true rowBatchContext: - dataColumnCount: 2 + dataColumnCount: 6 includeColumns: [0] - dataColumns: key:string, value:string + dataColumns: cint1:int, cint2:int, cdouble:double, cvarchar:varchar(50), cdecimal1:decimal(10,2)/DECIMAL_64, cdecimal2:decimal(38,5) partitionColumnCount: 0 scratchColumnTypeNames: [] Reducer 2 @@ -303,7 +157,7 @@ STAGE PLANS: vectorized: 
true rowBatchContext: dataColumnCount: 1 - dataColumns: KEY._col0:string + dataColumns: KEY._col0:int partitionColumnCount: 0 scratchColumnTypeNames: [] Reduce Operator Tree: @@ -311,23 +165,23 @@ STAGE PLANS: Group By Vectorization: className: VectorGroupByOperator groupByMode: MERGEPARTIAL - keyExpressions: col 0:string + keyExpressions: col 0:int native: false vectorProcessingMode: MERGE_PARTIAL projectedOutputColumnNums: [] - keys: KEY._col0 (type: string) + keys: KEY._col0 (type: int) mode: mergepartial outputColumnNames: _col0 - Statistics: Num rows: 250 Data size: 21750 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 5 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator - key expressions: _col0 (type: string) + key expressions: _col0 (type: int) sort order: + Reduce Sink Vectorization: className: VectorReduceSinkObjectHashOperator - keyColumns: 0:string + keyColumns: 0:int native: true nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine tez IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true - Statistics: Num rows: 250 Data size: 21750 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 5 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE TopN Hash Memory Usage: 0.1 Reducer 3 Execution mode: vectorized, llap @@ -341,30 +195,30 @@ STAGE PLANS: vectorized: true rowBatchContext: dataColumnCount: 1 - dataColumns: KEY.reducesinkkey0:string + dataColumns: KEY.reducesinkkey0:int partitionColumnCount: 0 scratchColumnTypeNames: [] Reduce Operator Tree: Select Operator - expressions: KEY.reducesinkkey0 (type: string) + expressions: KEY.reducesinkkey0 (type: int) outputColumnNames: _col0 Select Vectorization: className: VectorSelectOperator native: true projectedOutputColumnNums: [0] - Statistics: Num rows: 250 Data size: 21750 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 5 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE Limit - Number of rows: 5 + Number of rows: 3 Limit Vectorization: className: VectorLimitOperator native: true - Statistics: Num rows: 5 Data size: 435 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 3 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false File Sink Vectorization: className: VectorFileSinkOperator native: false - Statistics: Num rows: 5 Data size: 435 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 3 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat @@ -372,235 +226,81 @@ STAGE PLANS: Stage: Stage-0 Fetch Operator - limit: 5 + limit: 3 Processor Tree: ListSink -PREHOOK: query: SELECT key FROM src GROUP BY key ORDER BY key LIMIT 5 +PREHOOK: query: SELECT cint1 FROM t_test GROUP BY cint1 ORDER BY cint1 LIMIT 3 PREHOOK: type: QUERY -PREHOOK: Input: default@src +PREHOOK: Input: default@t_test #### A masked pattern was here #### -POSTHOOK: query: SELECT key FROM src GROUP BY key ORDER BY key LIMIT 5 +POSTHOOK: query: SELECT cint1 FROM t_test GROUP BY cint1 ORDER BY cint1 LIMIT 3 POSTHOOK: type: QUERY -POSTHOOK: Input: default@src +POSTHOOK: Input: default@t_test #### A masked pattern was here #### -0 -10 -100 -103 -104 -PREHOOK: query: explain vectorization detail -SELECT src1.key, 
src2.value FROM src src1 JOIN src src2 ON (src1.key = src2.key) ORDER BY src1.key LIMIT 5 +4 +6 +7 +PREHOOK: query: SELECT cint1, cint2 FROM t_test GROUP BY cint1, cint2 ORDER BY cint1, cint2 LIMIT 3 PREHOOK: type: QUERY -PREHOOK: Input: default@src +PREHOOK: Input: default@t_test #### A masked pattern was here #### -POSTHOOK: query: explain vectorization detail -SELECT src1.key, src2.value FROM src src1 JOIN src src2 ON (src1.key = src2.key) ORDER BY src1.key LIMIT 5 +POSTHOOK: query: SELECT cint1, cint2 FROM t_test GROUP BY cint1, cint2 ORDER BY cint1, cint2 LIMIT 3 POSTHOOK: type: QUERY -POSTHOOK: Input: default@src +POSTHOOK: Input: default@t_test #### A masked pattern was here #### -PLAN VECTORIZATION: - enabled: true - enabledConditionsMet: [hive.vectorized.execution.enabled IS true] - -STAGE DEPENDENCIES: - Stage-1 is a root stage - Stage-0 depends on stages: Stage-1 - -STAGE PLANS: - Stage: Stage-1 - Tez +4 1 +4 2 +6 2 +PREHOOK: query: SELECT cint1, cint2 FROM t_test GROUP BY cint1, cint2 ORDER BY cint1 DESC, cint2 LIMIT 3 +PREHOOK: type: QUERY +PREHOOK: Input: default@t_test #### A masked pattern was here #### - Edges: - Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 4 (SIMPLE_EDGE) - Reducer 3 <- Reducer 2 (SIMPLE_EDGE) +POSTHOOK: query: SELECT cint1, cint2 FROM t_test GROUP BY cint1, cint2 ORDER BY cint1 DESC, cint2 LIMIT 3 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t_test #### A masked pattern was here #### - Vertices: - Map 1 - Map Operator Tree: - TableScan - alias: src1 - filterExpr: key is not null (type: boolean) - Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE - TableScan Vectorization: - native: true - vectorizationSchemaColumns: [0:key:string, 1:value:string, 2:ROW__ID:struct] - Filter Operator - Filter Vectorization: - className: VectorFilterOperator - native: true - predicateExpression: SelectColumnIsNotNull(col 0:string) - predicate: key is not null (type: boolean) - Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE - Select Operator - expressions: key (type: string) - outputColumnNames: _col0 - Select Vectorization: - className: VectorSelectOperator - native: true - projectedOutputColumnNums: [0] - Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE - Reduce Output Operator - key expressions: _col0 (type: string) - sort order: + - Map-reduce partition columns: _col0 (type: string) - Reduce Sink Vectorization: - className: VectorReduceSinkStringOperator - keyColumns: 0:string - native: true - nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine tez IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true - Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE - Execution mode: vectorized, llap - LLAP IO: no inputs - Map Vectorization: - enabled: true - enabledConditionsMet: hive.vectorized.use.vector.serde.deserialize IS true - inputFormatFeatureSupport: [DECIMAL_64] - featureSupportInUse: [DECIMAL_64] - inputFileFormats: org.apache.hadoop.mapred.TextInputFormat - allNative: true - usesVectorUDFAdaptor: false - vectorized: true - rowBatchContext: - dataColumnCount: 2 - includeColumns: [0] - dataColumns: key:string, value:string - partitionColumnCount: 0 - scratchColumnTypeNames: [] - Map 4 - Map Operator Tree: - TableScan - alias: src2 - filterExpr: key is not null (type: boolean) - 
Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE - TableScan Vectorization: - native: true - vectorizationSchemaColumns: [0:key:string, 1:value:string, 2:ROW__ID:struct] - Filter Operator - Filter Vectorization: - className: VectorFilterOperator - native: true - predicateExpression: SelectColumnIsNotNull(col 0:string) - predicate: key is not null (type: boolean) - Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE - Select Operator - expressions: key (type: string), value (type: string) - outputColumnNames: _col0, _col1 - Select Vectorization: - className: VectorSelectOperator - native: true - projectedOutputColumnNums: [0, 1] - Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE - Reduce Output Operator - key expressions: _col0 (type: string) - sort order: + - Map-reduce partition columns: _col0 (type: string) - Reduce Sink Vectorization: - className: VectorReduceSinkStringOperator - keyColumns: 0:string - native: true - nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine tez IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true - valueColumns: 1:string - Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE - value expressions: _col1 (type: string) - Execution mode: vectorized, llap - LLAP IO: no inputs - Map Vectorization: - enabled: true - enabledConditionsMet: hive.vectorized.use.vector.serde.deserialize IS true - inputFormatFeatureSupport: [DECIMAL_64] - featureSupportInUse: [DECIMAL_64] - inputFileFormats: org.apache.hadoop.mapred.TextInputFormat - allNative: true - usesVectorUDFAdaptor: false - vectorized: true - rowBatchContext: - dataColumnCount: 2 - includeColumns: [0, 1] - dataColumns: key:string, value:string - partitionColumnCount: 0 - scratchColumnTypeNames: [] - Reducer 2 - Execution mode: llap - Reduce Operator Tree: - Merge Join Operator - condition map: - Inner Join 0 to 1 - keys: - 0 _col0 (type: string) - 1 _col0 (type: string) - outputColumnNames: _col0, _col2 - Statistics: Num rows: 791 Data size: 140798 Basic stats: COMPLETE Column stats: COMPLETE - Select Operator - expressions: _col0 (type: string), _col2 (type: string) - outputColumnNames: _col0, _col1 - Statistics: Num rows: 791 Data size: 140798 Basic stats: COMPLETE Column stats: COMPLETE - Reduce Output Operator - key expressions: _col0 (type: string) - sort order: + - Statistics: Num rows: 791 Data size: 140798 Basic stats: COMPLETE Column stats: COMPLETE - TopN Hash Memory Usage: 0.1 - value expressions: _col1 (type: string) - MergeJoin Vectorization: - enabled: false - enableConditionsNotMet: Vectorizing MergeJoin Supported IS false - Reducer 3 - Execution mode: vectorized, llap - Reduce Vectorization: - enabled: true - enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true, hive.execution.engine tez IN [tez, spark] IS true - reduceColumnNullOrder: z - reduceColumnSortOrder: + - allNative: false - usesVectorUDFAdaptor: false - vectorized: true - rowBatchContext: - dataColumnCount: 2 - dataColumns: KEY.reducesinkkey0:string, VALUE._col0:string - partitionColumnCount: 0 - scratchColumnTypeNames: [] - Reduce Operator Tree: - Select Operator - expressions: KEY.reducesinkkey0 (type: string), VALUE._col0 (type: string) - outputColumnNames: _col0, _col1 - Select Vectorization: - className: VectorSelectOperator - 
native: true - projectedOutputColumnNums: [0, 1] - Statistics: Num rows: 791 Data size: 140798 Basic stats: COMPLETE Column stats: COMPLETE - Limit - Number of rows: 5 - Limit Vectorization: - className: VectorLimitOperator - native: true - Statistics: Num rows: 5 Data size: 890 Basic stats: COMPLETE Column stats: COMPLETE - File Output Operator - compressed: false - File Sink Vectorization: - className: VectorFileSinkOperator - native: false - Statistics: Num rows: 5 Data size: 890 Basic stats: COMPLETE Column stats: COMPLETE - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - - Stage: Stage-0 - Fetch Operator - limit: 5 - Processor Tree: - ListSink - -PREHOOK: query: SELECT src1.key, src2.value FROM src src1 JOIN src src2 ON (src1.key = src2.key) ORDER BY src1.key LIMIT 5 +8 9 +7 8 +6 2 +PREHOOK: query: SELECT cint1, cdouble FROM t_test GROUP BY cint1, cdouble ORDER BY cint1, cdouble LIMIT 3 +PREHOOK: type: QUERY +PREHOOK: Input: default@t_test +#### A masked pattern was here #### +POSTHOOK: query: SELECT cint1, cdouble FROM t_test GROUP BY cint1, cdouble ORDER BY cint1, cdouble LIMIT 3 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t_test +#### A masked pattern was here #### +4 2.0 +4 3.3 +6 1.8 +PREHOOK: query: SELECT cvarchar, cdouble FROM t_test GROUP BY cvarchar, cdouble ORDER BY cvarchar, cdouble LIMIT 3 +PREHOOK: type: QUERY +PREHOOK: Input: default@t_test +#### A masked pattern was here #### +POSTHOOK: query: SELECT cvarchar, cdouble FROM t_test GROUP BY cvarchar, cdouble ORDER BY cvarchar, cdouble LIMIT 3 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t_test +#### A masked pattern was here #### +five 2.0 +four 4.5 +one 2.0 +PREHOOK: query: SELECT cdecimal1, cdecimal2 FROM t_test GROUP BY cdecimal1, cdecimal2 ORDER BY cdecimal1, cdecimal2 LIMIT 3 PREHOOK: type: QUERY -PREHOOK: Input: default@src +PREHOOK: Input: default@t_test #### A masked pattern was here #### -POSTHOOK: query: SELECT src1.key, src2.value FROM src src1 JOIN src src2 ON (src1.key = src2.key) ORDER BY src1.key LIMIT 5 +POSTHOOK: query: SELECT cdecimal1, cdecimal2 FROM t_test GROUP BY cdecimal1, cdecimal2 ORDER BY cdecimal1, cdecimal2 LIMIT 3 POSTHOOK: type: QUERY -POSTHOOK: Input: default@src +POSTHOOK: Input: default@t_test #### A masked pattern was here #### -0 val_0 -0 val_0 -0 val_0 -0 val_0 -0 val_0 +1.80 1.80000 +2.00 2.00000 +3.30 3.30000 +PREHOOK: query: DROP TABLE t_test +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@t_test +PREHOOK: Output: default@t_test +POSTHOOK: query: DROP TABLE t_test +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@t_test +POSTHOOK: Output: default@t_test diff --git ql/src/test/results/clientpositive/tez/vector_topnkey.q.out ql/src/test/results/clientpositive/tez/vector_topnkey.q.out index b6760db156..12c0d4cbac 100644 --- ql/src/test/results/clientpositive/tez/vector_topnkey.q.out +++ ql/src/test/results/clientpositive/tez/vector_topnkey.q.out @@ -1,211 +1,66 @@ -PREHOOK: query: explain vectorization detail -SELECT key, SUM(CAST(SUBSTR(value,5) AS INT)) FROM src GROUP BY key ORDER BY key LIMIT 5 +PREHOOK: query: CREATE TABLE t_test( + cint1 int, + cint2 int, + cdouble double, + cvarchar varchar(50), + cdecimal1 decimal(10,2), + cdecimal2 decimal(38,5) +) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@t_test +POSTHOOK: query: CREATE TABLE t_test( + cint1 int, + cint2 
int, + cdouble double, + cvarchar varchar(50), + cdecimal1 decimal(10,2), + cdecimal2 decimal(38,5) +) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@t_test +PREHOOK: query: INSERT INTO t_test VALUES +(NULL, NULL, NULL, NULL, NULL, NULL), +(8, 9, 2.0, 'one', 2.0, 2.0), (8, 9, 2.0, 'one', 2.0, 2.0), +(4, 2, 3.3, 'two', 3.3, 3.3), +(NULL, NULL, NULL, NULL, NULL, NULL), +(NULL, NULL, NULL, NULL, NULL, NULL), +(6, 2, 1.8, 'three', 1.8, 1.8), +(7, 8, 4.5, 'four', 4.5, 4.5), (7, 8, 4.5, 'four', 4.5, 4.5), (7, 8, 4.5, 'four', 4.5, 4.5), +(4, 1, 2.0, 'five', 2.0, 2.0), (4, 1, 2.0, 'five', 2.0, 2.0), (4, 1, 2.0, 'five', 2.0, 2.0), +(NULL, NULL, NULL, NULL, NULL, NULL) PREHOOK: type: QUERY -PREHOOK: Input: default@src -PREHOOK: Output: hdfs://### HDFS PATH ### -POSTHOOK: query: explain vectorization detail -SELECT key, SUM(CAST(SUBSTR(value,5) AS INT)) FROM src GROUP BY key ORDER BY key LIMIT 5 +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@t_test +POSTHOOK: query: INSERT INTO t_test VALUES +(NULL, NULL, NULL, NULL, NULL, NULL), +(8, 9, 2.0, 'one', 2.0, 2.0), (8, 9, 2.0, 'one', 2.0, 2.0), +(4, 2, 3.3, 'two', 3.3, 3.3), +(NULL, NULL, NULL, NULL, NULL, NULL), +(NULL, NULL, NULL, NULL, NULL, NULL), +(6, 2, 1.8, 'three', 1.8, 1.8), +(7, 8, 4.5, 'four', 4.5, 4.5), (7, 8, 4.5, 'four', 4.5, 4.5), (7, 8, 4.5, 'four', 4.5, 4.5), +(4, 1, 2.0, 'five', 2.0, 2.0), (4, 1, 2.0, 'five', 2.0, 2.0), (4, 1, 2.0, 'five', 2.0, 2.0), +(NULL, NULL, NULL, NULL, NULL, NULL) POSTHOOK: type: QUERY -POSTHOOK: Input: default@src -POSTHOOK: Output: hdfs://### HDFS PATH ### -PLAN VECTORIZATION: - enabled: true - enabledConditionsMet: [hive.vectorized.execution.enabled IS true] - -STAGE DEPENDENCIES: - Stage-1 is a root stage - Stage-0 depends on stages: Stage-1 - -STAGE PLANS: - Stage: Stage-1 - Tez -#### A masked pattern was here #### - Edges: - Reducer 2 <- Map 1 (SIMPLE_EDGE) - Reducer 3 <- Reducer 2 (SIMPLE_EDGE) -#### A masked pattern was here #### - Vertices: - Map 1 - Map Operator Tree: - TableScan - alias: src - Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE - TableScan Vectorization: - native: true - vectorizationSchemaColumns: [0:key:string, 1:value:string, 2:ROW__ID:struct] - Select Operator - expressions: key (type: string), UDFToInteger(substr(value, 5)) (type: int) - outputColumnNames: _col0, _col1 - Select Vectorization: - className: VectorSelectOperator - native: true - projectedOutputColumnNums: [0, 4] - selectExpressions: CastStringToLong(col 3:string)(children: StringSubstrColStart(col 1:string, start 4) -> 3:string) -> 4:int - Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE - Top N Key Operator - sort order: + - keys: _col0 (type: string) - Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE - top n: 5 - Top N Key Vectorization: - className: VectorTopNKeyOperator - keyExpressions: col 0:string - native: true - Group By Operator - aggregations: sum(_col1) - Group By Vectorization: - aggregators: VectorUDAFSumLong(col 4:int) -> bigint - className: VectorGroupByOperator - groupByMode: HASH - keyExpressions: col 0:string - native: false - vectorProcessingMode: HASH - projectedOutputColumnNums: [0] - keys: _col0 (type: string) - minReductionHashAggr: 0.5 - mode: hash - outputColumnNames: _col0, _col1 - Statistics: Num rows: 250 Data size: 23750 Basic stats: COMPLETE Column stats: COMPLETE - Reduce Output Operator - key expressions: 
_col0 (type: string) - sort order: + - Map-reduce partition columns: _col0 (type: string) - Reduce Sink Vectorization: - className: VectorReduceSinkStringOperator - keyColumns: 0:string - native: true - nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine tez IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true - valueColumns: 1:bigint - Statistics: Num rows: 250 Data size: 23750 Basic stats: COMPLETE Column stats: COMPLETE - TopN Hash Memory Usage: 0.1 - value expressions: _col1 (type: bigint) - Execution mode: vectorized - Map Vectorization: - enabled: true - enabledConditionsMet: hive.vectorized.use.vector.serde.deserialize IS true - inputFormatFeatureSupport: [DECIMAL_64] - featureSupportInUse: [DECIMAL_64] - inputFileFormats: org.apache.hadoop.mapred.TextInputFormat - allNative: false - usesVectorUDFAdaptor: false - vectorized: true - rowBatchContext: - dataColumnCount: 2 - includeColumns: [0, 1] - dataColumns: key:string, value:string - partitionColumnCount: 0 - scratchColumnTypeNames: [string, bigint] - Reducer 2 - Execution mode: vectorized - Reduce Vectorization: - enabled: true - enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true, hive.execution.engine tez IN [tez, spark] IS true - reduceColumnNullOrder: z - reduceColumnSortOrder: + - allNative: false - usesVectorUDFAdaptor: false - vectorized: true - rowBatchContext: - dataColumnCount: 2 - dataColumns: KEY._col0:string, VALUE._col0:bigint - partitionColumnCount: 0 - scratchColumnTypeNames: [] - Reduce Operator Tree: - Group By Operator - aggregations: sum(VALUE._col0) - Group By Vectorization: - aggregators: VectorUDAFSumLong(col 1:bigint) -> bigint - className: VectorGroupByOperator - groupByMode: MERGEPARTIAL - keyExpressions: col 0:string - native: false - vectorProcessingMode: MERGE_PARTIAL - projectedOutputColumnNums: [0] - keys: KEY._col0 (type: string) - mode: mergepartial - outputColumnNames: _col0, _col1 - Statistics: Num rows: 250 Data size: 23750 Basic stats: COMPLETE Column stats: COMPLETE - Reduce Output Operator - key expressions: _col0 (type: string) - sort order: + - Reduce Sink Vectorization: - className: VectorReduceSinkObjectHashOperator - keyColumns: 0:string - native: true - nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine tez IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true - valueColumns: 1:bigint - Statistics: Num rows: 250 Data size: 23750 Basic stats: COMPLETE Column stats: COMPLETE - TopN Hash Memory Usage: 0.1 - value expressions: _col1 (type: bigint) - Reducer 3 - Execution mode: vectorized - Reduce Vectorization: - enabled: true - enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true, hive.execution.engine tez IN [tez, spark] IS true - reduceColumnNullOrder: z - reduceColumnSortOrder: + - allNative: false - usesVectorUDFAdaptor: false - vectorized: true - rowBatchContext: - dataColumnCount: 2 - dataColumns: KEY.reducesinkkey0:string, VALUE._col0:bigint - partitionColumnCount: 0 - scratchColumnTypeNames: [] - Reduce Operator Tree: - Select Operator - expressions: KEY.reducesinkkey0 (type: string), VALUE._col0 (type: bigint) - outputColumnNames: _col0, _col1 - Select Vectorization: - className: VectorSelectOperator - native: true - projectedOutputColumnNums: [0, 1] - Statistics: 
Num rows: 250 Data size: 23750 Basic stats: COMPLETE Column stats: COMPLETE - Limit - Number of rows: 5 - Limit Vectorization: - className: VectorLimitOperator - native: true - Statistics: Num rows: 5 Data size: 475 Basic stats: COMPLETE Column stats: COMPLETE - File Output Operator - compressed: false - File Sink Vectorization: - className: VectorFileSinkOperator - native: false - Statistics: Num rows: 5 Data size: 475 Basic stats: COMPLETE Column stats: COMPLETE - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - - Stage: Stage-0 - Fetch Operator - limit: 5 - Processor Tree: - ListSink - -PREHOOK: query: SELECT key, SUM(CAST(SUBSTR(value,5) AS INT)) FROM src GROUP BY key ORDER BY key LIMIT 5 +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@t_test +POSTHOOK: Lineage: t_test.cdecimal1 SCRIPT [] +POSTHOOK: Lineage: t_test.cdecimal2 SCRIPT [] +POSTHOOK: Lineage: t_test.cdouble SCRIPT [] +POSTHOOK: Lineage: t_test.cint1 SCRIPT [] +POSTHOOK: Lineage: t_test.cint2 SCRIPT [] +POSTHOOK: Lineage: t_test.cvarchar SCRIPT [] +PREHOOK: query: EXPLAIN VECTORIZATION DETAIL +SELECT cint1 FROM t_test GROUP BY cint1 ORDER BY cint1 LIMIT 3 PREHOOK: type: QUERY -PREHOOK: Input: default@src +PREHOOK: Input: default@t_test PREHOOK: Output: hdfs://### HDFS PATH ### -POSTHOOK: query: SELECT key, SUM(CAST(SUBSTR(value,5) AS INT)) FROM src GROUP BY key ORDER BY key LIMIT 5 +POSTHOOK: query: EXPLAIN VECTORIZATION DETAIL +SELECT cint1 FROM t_test GROUP BY cint1 ORDER BY cint1 LIMIT 3 POSTHOOK: type: QUERY -POSTHOOK: Input: default@src -POSTHOOK: Output: hdfs://### HDFS PATH ### -0 0 -10 10 -100 200 -103 206 -104 208 -PREHOOK: query: explain vectorization detail -SELECT key FROM src GROUP BY key ORDER BY key LIMIT 5 -PREHOOK: type: QUERY -PREHOOK: Input: default@src -PREHOOK: Output: hdfs://### HDFS PATH ### -POSTHOOK: query: explain vectorization detail -SELECT key FROM src GROUP BY key ORDER BY key LIMIT 5 -POSTHOOK: type: QUERY -POSTHOOK: Input: default@src +POSTHOOK: Input: default@t_test POSTHOOK: Output: hdfs://### HDFS PATH ### PLAN VECTORIZATION: enabled: true @@ -227,51 +82,51 @@ STAGE PLANS: Map 1 Map Operator Tree: TableScan - alias: src - Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE + alias: t_test + Statistics: Num rows: 14 Data size: 44 Basic stats: COMPLETE Column stats: COMPLETE TableScan Vectorization: native: true - vectorizationSchemaColumns: [0:key:string, 1:value:string, 2:ROW__ID:struct] + vectorizationSchemaColumns: [0:cint1:int, 1:cint2:int, 2:cdouble:double, 3:cvarchar:varchar(50), 4:cdecimal1:decimal(10,2)/DECIMAL_64, 5:cdecimal2:decimal(38,5), 6:ROW__ID:struct] Select Operator - expressions: key (type: string) - outputColumnNames: key + expressions: cint1 (type: int) + outputColumnNames: cint1 Select Vectorization: className: VectorSelectOperator native: true projectedOutputColumnNums: [0] - Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 14 Data size: 44 Basic stats: COMPLETE Column stats: COMPLETE Top N Key Operator sort order: + - keys: key (type: string) - Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE - top n: 5 + keys: cint1 (type: int) + Statistics: Num rows: 14 Data size: 44 Basic stats: COMPLETE Column stats: COMPLETE + top n: 3 Top N Key Vectorization: 
className: VectorTopNKeyOperator - keyExpressions: col 0:string + keyExpressions: col 0:int native: true Group By Operator Group By Vectorization: className: VectorGroupByOperator groupByMode: HASH - keyExpressions: col 0:string + keyExpressions: col 0:int native: false vectorProcessingMode: HASH projectedOutputColumnNums: [] - keys: key (type: string) - minReductionHashAggr: 0.5 + keys: cint1 (type: int) + minReductionHashAggr: 0.64285713 mode: hash outputColumnNames: _col0 - Statistics: Num rows: 250 Data size: 21750 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 5 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator - key expressions: _col0 (type: string) + key expressions: _col0 (type: int) sort order: + - Map-reduce partition columns: _col0 (type: string) + Map-reduce partition columns: _col0 (type: int) Reduce Sink Vectorization: - className: VectorReduceSinkStringOperator - keyColumns: 0:string + className: VectorReduceSinkLongOperator + keyColumns: 0:int native: true nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine tez IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true - Statistics: Num rows: 250 Data size: 21750 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 5 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE TopN Hash Memory Usage: 0.1 Execution mode: vectorized Map Vectorization: @@ -284,9 +139,9 @@ STAGE PLANS: usesVectorUDFAdaptor: false vectorized: true rowBatchContext: - dataColumnCount: 2 + dataColumnCount: 6 includeColumns: [0] - dataColumns: key:string, value:string + dataColumns: cint1:int, cint2:int, cdouble:double, cvarchar:varchar(50), cdecimal1:decimal(10,2)/DECIMAL_64, cdecimal2:decimal(38,5) partitionColumnCount: 0 scratchColumnTypeNames: [] Reducer 2 @@ -301,7 +156,7 @@ STAGE PLANS: vectorized: true rowBatchContext: dataColumnCount: 1 - dataColumns: KEY._col0:string + dataColumns: KEY._col0:int partitionColumnCount: 0 scratchColumnTypeNames: [] Reduce Operator Tree: @@ -309,23 +164,23 @@ STAGE PLANS: Group By Vectorization: className: VectorGroupByOperator groupByMode: MERGEPARTIAL - keyExpressions: col 0:string + keyExpressions: col 0:int native: false vectorProcessingMode: MERGE_PARTIAL projectedOutputColumnNums: [] - keys: KEY._col0 (type: string) + keys: KEY._col0 (type: int) mode: mergepartial outputColumnNames: _col0 - Statistics: Num rows: 250 Data size: 21750 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 5 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator - key expressions: _col0 (type: string) + key expressions: _col0 (type: int) sort order: + Reduce Sink Vectorization: className: VectorReduceSinkObjectHashOperator - keyColumns: 0:string + keyColumns: 0:int native: true nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine tez IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true - Statistics: Num rows: 250 Data size: 21750 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 5 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE TopN Hash Memory Usage: 0.1 Reducer 3 Execution mode: vectorized @@ -339,30 +194,30 @@ STAGE PLANS: vectorized: true rowBatchContext: dataColumnCount: 1 - dataColumns: 
KEY.reducesinkkey0:string + dataColumns: KEY.reducesinkkey0:int partitionColumnCount: 0 scratchColumnTypeNames: [] Reduce Operator Tree: Select Operator - expressions: KEY.reducesinkkey0 (type: string) + expressions: KEY.reducesinkkey0 (type: int) outputColumnNames: _col0 Select Vectorization: className: VectorSelectOperator native: true projectedOutputColumnNums: [0] - Statistics: Num rows: 250 Data size: 21750 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 5 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE Limit - Number of rows: 5 + Number of rows: 3 Limit Vectorization: className: VectorLimitOperator native: true - Statistics: Num rows: 5 Data size: 435 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 3 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false File Sink Vectorization: className: VectorFileSinkOperator native: false - Statistics: Num rows: 5 Data size: 435 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 3 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat @@ -370,232 +225,81 @@ STAGE PLANS: Stage: Stage-0 Fetch Operator - limit: 5 + limit: 3 Processor Tree: ListSink -PREHOOK: query: SELECT key FROM src GROUP BY key ORDER BY key LIMIT 5 +PREHOOK: query: SELECT cint1 FROM t_test GROUP BY cint1 ORDER BY cint1 LIMIT 3 PREHOOK: type: QUERY -PREHOOK: Input: default@src +PREHOOK: Input: default@t_test PREHOOK: Output: hdfs://### HDFS PATH ### -POSTHOOK: query: SELECT key FROM src GROUP BY key ORDER BY key LIMIT 5 +POSTHOOK: query: SELECT cint1 FROM t_test GROUP BY cint1 ORDER BY cint1 LIMIT 3 POSTHOOK: type: QUERY -POSTHOOK: Input: default@src +POSTHOOK: Input: default@t_test POSTHOOK: Output: hdfs://### HDFS PATH ### -0 -10 -100 -103 -104 -PREHOOK: query: explain vectorization detail -SELECT src1.key, src2.value FROM src src1 JOIN src src2 ON (src1.key = src2.key) ORDER BY src1.key LIMIT 5 +4 +6 +7 +PREHOOK: query: SELECT cint1, cint2 FROM t_test GROUP BY cint1, cint2 ORDER BY cint1, cint2 LIMIT 3 PREHOOK: type: QUERY -PREHOOK: Input: default@src +PREHOOK: Input: default@t_test PREHOOK: Output: hdfs://### HDFS PATH ### -POSTHOOK: query: explain vectorization detail -SELECT src1.key, src2.value FROM src src1 JOIN src src2 ON (src1.key = src2.key) ORDER BY src1.key LIMIT 5 +POSTHOOK: query: SELECT cint1, cint2 FROM t_test GROUP BY cint1, cint2 ORDER BY cint1, cint2 LIMIT 3 POSTHOOK: type: QUERY -POSTHOOK: Input: default@src +POSTHOOK: Input: default@t_test POSTHOOK: Output: hdfs://### HDFS PATH ### -PLAN VECTORIZATION: - enabled: true - enabledConditionsMet: [hive.vectorized.execution.enabled IS true] - -STAGE DEPENDENCIES: - Stage-1 is a root stage - Stage-0 depends on stages: Stage-1 - -STAGE PLANS: - Stage: Stage-1 - Tez -#### A masked pattern was here #### - Edges: - Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 4 (SIMPLE_EDGE) - Reducer 3 <- Reducer 2 (SIMPLE_EDGE) -#### A masked pattern was here #### - Vertices: - Map 1 - Map Operator Tree: - TableScan - alias: src1 - filterExpr: key is not null (type: boolean) - Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE - TableScan Vectorization: - native: true - vectorizationSchemaColumns: [0:key:string, 1:value:string, 2:ROW__ID:struct] - Filter Operator - Filter Vectorization: - className: VectorFilterOperator - native: true - 
predicateExpression: SelectColumnIsNotNull(col 0:string) - predicate: key is not null (type: boolean) - Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE - Select Operator - expressions: key (type: string) - outputColumnNames: _col0 - Select Vectorization: - className: VectorSelectOperator - native: true - projectedOutputColumnNums: [0] - Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE - Reduce Output Operator - key expressions: _col0 (type: string) - sort order: + - Map-reduce partition columns: _col0 (type: string) - Reduce Sink Vectorization: - className: VectorReduceSinkStringOperator - keyColumns: 0:string - native: true - nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine tez IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true - Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE - Execution mode: vectorized - Map Vectorization: - enabled: true - enabledConditionsMet: hive.vectorized.use.vector.serde.deserialize IS true - inputFormatFeatureSupport: [DECIMAL_64] - featureSupportInUse: [DECIMAL_64] - inputFileFormats: org.apache.hadoop.mapred.TextInputFormat - allNative: true - usesVectorUDFAdaptor: false - vectorized: true - rowBatchContext: - dataColumnCount: 2 - includeColumns: [0] - dataColumns: key:string, value:string - partitionColumnCount: 0 - scratchColumnTypeNames: [] - Map 4 - Map Operator Tree: - TableScan - alias: src2 - filterExpr: key is not null (type: boolean) - Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE - TableScan Vectorization: - native: true - vectorizationSchemaColumns: [0:key:string, 1:value:string, 2:ROW__ID:struct] - Filter Operator - Filter Vectorization: - className: VectorFilterOperator - native: true - predicateExpression: SelectColumnIsNotNull(col 0:string) - predicate: key is not null (type: boolean) - Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE - Select Operator - expressions: key (type: string), value (type: string) - outputColumnNames: _col0, _col1 - Select Vectorization: - className: VectorSelectOperator - native: true - projectedOutputColumnNums: [0, 1] - Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE - Reduce Output Operator - key expressions: _col0 (type: string) - sort order: + - Map-reduce partition columns: _col0 (type: string) - Reduce Sink Vectorization: - className: VectorReduceSinkStringOperator - keyColumns: 0:string - native: true - nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine tez IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true - valueColumns: 1:string - Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE - value expressions: _col1 (type: string) - Execution mode: vectorized - Map Vectorization: - enabled: true - enabledConditionsMet: hive.vectorized.use.vector.serde.deserialize IS true - inputFormatFeatureSupport: [DECIMAL_64] - featureSupportInUse: [DECIMAL_64] - inputFileFormats: org.apache.hadoop.mapred.TextInputFormat - allNative: true - usesVectorUDFAdaptor: false - vectorized: true - rowBatchContext: - dataColumnCount: 2 - includeColumns: [0, 1] - dataColumns: 
key:string, value:string - partitionColumnCount: 0 - scratchColumnTypeNames: [] - Reducer 2 - Reduce Operator Tree: - Merge Join Operator - condition map: - Inner Join 0 to 1 - keys: - 0 _col0 (type: string) - 1 _col0 (type: string) - outputColumnNames: _col0, _col2 - Statistics: Num rows: 791 Data size: 140798 Basic stats: COMPLETE Column stats: COMPLETE - Select Operator - expressions: _col0 (type: string), _col2 (type: string) - outputColumnNames: _col0, _col1 - Statistics: Num rows: 791 Data size: 140798 Basic stats: COMPLETE Column stats: COMPLETE - Reduce Output Operator - key expressions: _col0 (type: string) - sort order: + - Statistics: Num rows: 791 Data size: 140798 Basic stats: COMPLETE Column stats: COMPLETE - TopN Hash Memory Usage: 0.1 - value expressions: _col1 (type: string) - MergeJoin Vectorization: - enabled: false - enableConditionsNotMet: Vectorizing MergeJoin Supported IS false - Reducer 3 - Execution mode: vectorized - Reduce Vectorization: - enabled: true - enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true, hive.execution.engine tez IN [tez, spark] IS true - reduceColumnNullOrder: z - reduceColumnSortOrder: + - allNative: false - usesVectorUDFAdaptor: false - vectorized: true - rowBatchContext: - dataColumnCount: 2 - dataColumns: KEY.reducesinkkey0:string, VALUE._col0:string - partitionColumnCount: 0 - scratchColumnTypeNames: [] - Reduce Operator Tree: - Select Operator - expressions: KEY.reducesinkkey0 (type: string), VALUE._col0 (type: string) - outputColumnNames: _col0, _col1 - Select Vectorization: - className: VectorSelectOperator - native: true - projectedOutputColumnNums: [0, 1] - Statistics: Num rows: 791 Data size: 140798 Basic stats: COMPLETE Column stats: COMPLETE - Limit - Number of rows: 5 - Limit Vectorization: - className: VectorLimitOperator - native: true - Statistics: Num rows: 5 Data size: 890 Basic stats: COMPLETE Column stats: COMPLETE - File Output Operator - compressed: false - File Sink Vectorization: - className: VectorFileSinkOperator - native: false - Statistics: Num rows: 5 Data size: 890 Basic stats: COMPLETE Column stats: COMPLETE - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - - Stage: Stage-0 - Fetch Operator - limit: 5 - Processor Tree: - ListSink - -PREHOOK: query: SELECT src1.key, src2.value FROM src src1 JOIN src src2 ON (src1.key = src2.key) ORDER BY src1.key LIMIT 5 +4 1 +4 2 +6 2 +PREHOOK: query: SELECT cint1, cint2 FROM t_test GROUP BY cint1, cint2 ORDER BY cint1 DESC, cint2 LIMIT 3 +PREHOOK: type: QUERY +PREHOOK: Input: default@t_test +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: SELECT cint1, cint2 FROM t_test GROUP BY cint1, cint2 ORDER BY cint1 DESC, cint2 LIMIT 3 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t_test +POSTHOOK: Output: hdfs://### HDFS PATH ### +8 9 +7 8 +6 2 +PREHOOK: query: SELECT cint1, cdouble FROM t_test GROUP BY cint1, cdouble ORDER BY cint1, cdouble LIMIT 3 +PREHOOK: type: QUERY +PREHOOK: Input: default@t_test +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: SELECT cint1, cdouble FROM t_test GROUP BY cint1, cdouble ORDER BY cint1, cdouble LIMIT 3 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t_test +POSTHOOK: Output: hdfs://### HDFS PATH ### +4 2.0 +4 3.3 +6 1.8 +PREHOOK: query: SELECT cvarchar, cdouble FROM t_test GROUP BY cvarchar, cdouble ORDER BY cvarchar, cdouble LIMIT 3 +PREHOOK: 
type: QUERY +PREHOOK: Input: default@t_test +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: SELECT cvarchar, cdouble FROM t_test GROUP BY cvarchar, cdouble ORDER BY cvarchar, cdouble LIMIT 3 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t_test +POSTHOOK: Output: hdfs://### HDFS PATH ### +five 2.0 +four 4.5 +one 2.0 +PREHOOK: query: SELECT cdecimal1, cdecimal2 FROM t_test GROUP BY cdecimal1, cdecimal2 ORDER BY cdecimal1, cdecimal2 LIMIT 3 PREHOOK: type: QUERY -PREHOOK: Input: default@src +PREHOOK: Input: default@t_test PREHOOK: Output: hdfs://### HDFS PATH ### -POSTHOOK: query: SELECT src1.key, src2.value FROM src src1 JOIN src src2 ON (src1.key = src2.key) ORDER BY src1.key LIMIT 5 +POSTHOOK: query: SELECT cdecimal1, cdecimal2 FROM t_test GROUP BY cdecimal1, cdecimal2 ORDER BY cdecimal1, cdecimal2 LIMIT 3 POSTHOOK: type: QUERY -POSTHOOK: Input: default@src +POSTHOOK: Input: default@t_test POSTHOOK: Output: hdfs://### HDFS PATH ### -0 val_0 -0 val_0 -0 val_0 -0 val_0 -0 val_0 +1.80 1.80000 +2.00 2.00000 +3.30 3.30000 +PREHOOK: query: DROP TABLE t_test +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@t_test +PREHOOK: Output: default@t_test +POSTHOOK: query: DROP TABLE t_test +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@t_test +POSTHOOK: Output: default@t_test diff --git ql/src/test/results/clientpositive/vector_topnkey.q.out ql/src/test/results/clientpositive/vector_topnkey.q.out index 3438be2dc0..dc2caf7d2f 100644 --- ql/src/test/results/clientpositive/vector_topnkey.q.out +++ ql/src/test/results/clientpositive/vector_topnkey.q.out @@ -1,184 +1,66 @@ -PREHOOK: query: explain vectorization detail -SELECT key, SUM(CAST(SUBSTR(value,5) AS INT)) FROM src GROUP BY key ORDER BY key LIMIT 5 +PREHOOK: query: CREATE TABLE t_test( + cint1 int, + cint2 int, + cdouble double, + cvarchar varchar(50), + cdecimal1 decimal(10,2), + cdecimal2 decimal(38,5) +) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@t_test +POSTHOOK: query: CREATE TABLE t_test( + cint1 int, + cint2 int, + cdouble double, + cvarchar varchar(50), + cdecimal1 decimal(10,2), + cdecimal2 decimal(38,5) +) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@t_test +PREHOOK: query: INSERT INTO t_test VALUES +(NULL, NULL, NULL, NULL, NULL, NULL), +(8, 9, 2.0, 'one', 2.0, 2.0), (8, 9, 2.0, 'one', 2.0, 2.0), +(4, 2, 3.3, 'two', 3.3, 3.3), +(NULL, NULL, NULL, NULL, NULL, NULL), +(NULL, NULL, NULL, NULL, NULL, NULL), +(6, 2, 1.8, 'three', 1.8, 1.8), +(7, 8, 4.5, 'four', 4.5, 4.5), (7, 8, 4.5, 'four', 4.5, 4.5), (7, 8, 4.5, 'four', 4.5, 4.5), +(4, 1, 2.0, 'five', 2.0, 2.0), (4, 1, 2.0, 'five', 2.0, 2.0), (4, 1, 2.0, 'five', 2.0, 2.0), +(NULL, NULL, NULL, NULL, NULL, NULL) PREHOOK: type: QUERY -PREHOOK: Input: default@src -#### A masked pattern was here #### -POSTHOOK: query: explain vectorization detail -SELECT key, SUM(CAST(SUBSTR(value,5) AS INT)) FROM src GROUP BY key ORDER BY key LIMIT 5 +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@t_test +POSTHOOK: query: INSERT INTO t_test VALUES +(NULL, NULL, NULL, NULL, NULL, NULL), +(8, 9, 2.0, 'one', 2.0, 2.0), (8, 9, 2.0, 'one', 2.0, 2.0), +(4, 2, 3.3, 'two', 3.3, 3.3), +(NULL, NULL, NULL, NULL, NULL, NULL), +(NULL, NULL, NULL, NULL, NULL, NULL), +(6, 2, 1.8, 'three', 1.8, 1.8), +(7, 8, 4.5, 'four', 4.5, 4.5), (7, 8, 4.5, 'four', 4.5, 4.5), (7, 8, 4.5, 'four', 4.5, 4.5), +(4, 1, 2.0, 'five', 2.0, 2.0), (4, 1, 2.0, 'five', 2.0, 2.0), (4, 1, 2.0, 'five', 2.0, 2.0), +(NULL, 
NULL, NULL, NULL, NULL, NULL) POSTHOOK: type: QUERY -POSTHOOK: Input: default@src -#### A masked pattern was here #### -PLAN VECTORIZATION: - enabled: true - enabledConditionsMet: [hive.vectorized.execution.enabled IS true] - -STAGE DEPENDENCIES: - Stage-1 is a root stage - Stage-2 depends on stages: Stage-1 - Stage-0 depends on stages: Stage-2 - -STAGE PLANS: - Stage: Stage-1 - Map Reduce - Map Operator Tree: - TableScan - alias: src - Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE - TableScan Vectorization: - native: true - vectorizationSchemaColumns: [0:key:string, 1:value:string, 2:ROW__ID:struct] - Select Operator - expressions: key (type: string), UDFToInteger(substr(value, 5)) (type: int) - outputColumnNames: _col0, _col1 - Select Vectorization: - className: VectorSelectOperator - native: true - projectedOutputColumnNums: [0, 4] - selectExpressions: CastStringToLong(col 3:string)(children: StringSubstrColStart(col 1:string, start 4) -> 3:string) -> 4:int - Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE - Group By Operator - aggregations: sum(_col1) - Group By Vectorization: - aggregators: VectorUDAFSumLong(col 4:int) -> bigint - className: VectorGroupByOperator - groupByMode: HASH - keyExpressions: col 0:string - native: false - vectorProcessingMode: HASH - projectedOutputColumnNums: [0] - keys: _col0 (type: string) - minReductionHashAggr: 0.99 - mode: hash - outputColumnNames: _col0, _col1 - Statistics: Num rows: 250 Data size: 23750 Basic stats: COMPLETE Column stats: COMPLETE - Reduce Output Operator - key expressions: _col0 (type: string) - sort order: + - Map-reduce partition columns: _col0 (type: string) - Reduce Sink Vectorization: - className: VectorReduceSinkOperator - native: false - nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true - nativeConditionsNotMet: hive.execution.engine mr IN [tez, spark] IS false - Statistics: Num rows: 250 Data size: 23750 Basic stats: COMPLETE Column stats: COMPLETE - TopN Hash Memory Usage: 0.1 - value expressions: _col1 (type: bigint) - Execution mode: vectorized - Map Vectorization: - enabled: true - enabledConditionsMet: hive.vectorized.use.vector.serde.deserialize IS true - inputFormatFeatureSupport: [DECIMAL_64] - featureSupportInUse: [DECIMAL_64] - inputFileFormats: org.apache.hadoop.mapred.TextInputFormat - allNative: false - usesVectorUDFAdaptor: false - vectorized: true - rowBatchContext: - dataColumnCount: 2 - includeColumns: [0, 1] - dataColumns: key:string, value:string - partitionColumnCount: 0 - scratchColumnTypeNames: [string, bigint] - Reduce Vectorization: - enabled: false - enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true - enableConditionsNotMet: hive.execution.engine mr IN [tez, spark] IS false - Reduce Operator Tree: - Group By Operator - aggregations: sum(VALUE._col0) - keys: KEY._col0 (type: string) - mode: mergepartial - outputColumnNames: _col0, _col1 - Statistics: Num rows: 250 Data size: 23750 Basic stats: COMPLETE Column stats: COMPLETE - File Output Operator - compressed: false - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe - - Stage: Stage-2 - Map Reduce - Map Operator Tree: - TableScan - TableScan 
Vectorization: - native: true - vectorizationSchemaColumns: [0:_col0:string, 1:_col1:bigint] - Reduce Output Operator - key expressions: _col0 (type: string) - sort order: + - Reduce Sink Vectorization: - className: VectorReduceSinkOperator - native: false - nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true - nativeConditionsNotMet: hive.execution.engine mr IN [tez, spark] IS false - Statistics: Num rows: 250 Data size: 23750 Basic stats: COMPLETE Column stats: COMPLETE - TopN Hash Memory Usage: 0.1 - value expressions: _col1 (type: bigint) - Execution mode: vectorized - Map Vectorization: - enabled: true - enabledConditionsMet: hive.vectorized.use.vector.serde.deserialize IS true - inputFormatFeatureSupport: [] - featureSupportInUse: [] - inputFileFormats: org.apache.hadoop.mapred.SequenceFileInputFormat - allNative: false - usesVectorUDFAdaptor: false - vectorized: true - rowBatchContext: - dataColumnCount: 2 - includeColumns: [0, 1] - dataColumns: _col0:string, _col1:bigint - partitionColumnCount: 0 - scratchColumnTypeNames: [] - Reduce Vectorization: - enabled: false - enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true - enableConditionsNotMet: hive.execution.engine mr IN [tez, spark] IS false - Reduce Operator Tree: - Select Operator - expressions: KEY.reducesinkkey0 (type: string), VALUE._col0 (type: bigint) - outputColumnNames: _col0, _col1 - Statistics: Num rows: 250 Data size: 23750 Basic stats: COMPLETE Column stats: COMPLETE - Limit - Number of rows: 5 - Statistics: Num rows: 5 Data size: 475 Basic stats: COMPLETE Column stats: COMPLETE - File Output Operator - compressed: false - Statistics: Num rows: 5 Data size: 475 Basic stats: COMPLETE Column stats: COMPLETE - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - - Stage: Stage-0 - Fetch Operator - limit: 5 - Processor Tree: - ListSink - -PREHOOK: query: SELECT key, SUM(CAST(SUBSTR(value,5) AS INT)) FROM src GROUP BY key ORDER BY key LIMIT 5 +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@t_test +POSTHOOK: Lineage: t_test.cdecimal1 SCRIPT [] +POSTHOOK: Lineage: t_test.cdecimal2 SCRIPT [] +POSTHOOK: Lineage: t_test.cdouble SCRIPT [] +POSTHOOK: Lineage: t_test.cint1 SCRIPT [] +POSTHOOK: Lineage: t_test.cint2 SCRIPT [] +POSTHOOK: Lineage: t_test.cvarchar SCRIPT [] +PREHOOK: query: EXPLAIN VECTORIZATION DETAIL +SELECT cint1 FROM t_test GROUP BY cint1 ORDER BY cint1 LIMIT 3 PREHOOK: type: QUERY -PREHOOK: Input: default@src +PREHOOK: Input: default@t_test #### A masked pattern was here #### -POSTHOOK: query: SELECT key, SUM(CAST(SUBSTR(value,5) AS INT)) FROM src GROUP BY key ORDER BY key LIMIT 5 +POSTHOOK: query: EXPLAIN VECTORIZATION DETAIL +SELECT cint1 FROM t_test GROUP BY cint1 ORDER BY cint1 LIMIT 3 POSTHOOK: type: QUERY -POSTHOOK: Input: default@src -#### A masked pattern was here #### -0 0 -10 10 -100 200 -103 206 -104 208 -PREHOOK: query: explain vectorization detail -SELECT key FROM src GROUP BY key ORDER BY key LIMIT 5 -PREHOOK: type: QUERY -PREHOOK: Input: default@src -#### A masked pattern was here #### -POSTHOOK: query: explain vectorization detail -SELECT key FROM src GROUP BY key ORDER BY key LIMIT 5 -POSTHOOK: type: QUERY -POSTHOOK: Input: default@src +POSTHOOK: Input: 
default@t_test #### A masked pattern was here #### PLAN VECTORIZATION: enabled: true @@ -194,42 +76,42 @@ STAGE PLANS: Map Reduce Map Operator Tree: TableScan - alias: src - Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE + alias: t_test + Statistics: Num rows: 14 Data size: 44 Basic stats: COMPLETE Column stats: COMPLETE TableScan Vectorization: native: true - vectorizationSchemaColumns: [0:key:string, 1:value:string, 2:ROW__ID:struct] + vectorizationSchemaColumns: [0:cint1:int, 1:cint2:int, 2:cdouble:double, 3:cvarchar:varchar(50), 4:cdecimal1:decimal(10,2)/DECIMAL_64, 5:cdecimal2:decimal(38,5), 6:ROW__ID:struct] Select Operator - expressions: key (type: string) - outputColumnNames: key + expressions: cint1 (type: int) + outputColumnNames: cint1 Select Vectorization: className: VectorSelectOperator native: true projectedOutputColumnNums: [0] - Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 14 Data size: 44 Basic stats: COMPLETE Column stats: COMPLETE Group By Operator Group By Vectorization: className: VectorGroupByOperator groupByMode: HASH - keyExpressions: col 0:string + keyExpressions: col 0:int native: false vectorProcessingMode: HASH projectedOutputColumnNums: [] - keys: key (type: string) + keys: cint1 (type: int) minReductionHashAggr: 0.99 mode: hash outputColumnNames: _col0 - Statistics: Num rows: 250 Data size: 21750 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 5 Data size: 20 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator - key expressions: _col0 (type: string) + key expressions: _col0 (type: int) sort order: + - Map-reduce partition columns: _col0 (type: string) + Map-reduce partition columns: _col0 (type: int) Reduce Sink Vectorization: className: VectorReduceSinkOperator native: false nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true nativeConditionsNotMet: hive.execution.engine mr IN [tez, spark] IS false - Statistics: Num rows: 250 Data size: 21750 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 5 Data size: 20 Basic stats: COMPLETE Column stats: COMPLETE TopN Hash Memory Usage: 0.1 Execution mode: vectorized Map Vectorization: @@ -242,9 +124,9 @@ STAGE PLANS: usesVectorUDFAdaptor: false vectorized: true rowBatchContext: - dataColumnCount: 2 + dataColumnCount: 6 includeColumns: [0] - dataColumns: key:string, value:string + dataColumns: cint1:int, cint2:int, cdouble:double, cvarchar:varchar(50), cdecimal1:decimal(10,2)/DECIMAL_64, cdecimal2:decimal(38,5) partitionColumnCount: 0 scratchColumnTypeNames: [] Reduce Vectorization: @@ -253,10 +135,10 @@ STAGE PLANS: enableConditionsNotMet: hive.execution.engine mr IN [tez, spark] IS false Reduce Operator Tree: Group By Operator - keys: KEY._col0 (type: string) + keys: KEY._col0 (type: int) mode: mergepartial outputColumnNames: _col0 - Statistics: Num rows: 250 Data size: 21750 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 5 Data size: 20 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false table: @@ -270,16 +152,16 @@ STAGE PLANS: TableScan TableScan Vectorization: native: true - vectorizationSchemaColumns: [0:_col0:string] + vectorizationSchemaColumns: [0:_col0:int] Reduce Output Operator - key expressions: _col0 (type: string) + key expressions: _col0 
(type: int) sort order: + Reduce Sink Vectorization: className: VectorReduceSinkOperator native: false nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true nativeConditionsNotMet: hive.execution.engine mr IN [tez, spark] IS false - Statistics: Num rows: 250 Data size: 21750 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 5 Data size: 20 Basic stats: COMPLETE Column stats: COMPLETE TopN Hash Memory Usage: 0.1 Execution mode: vectorized Map Vectorization: @@ -294,7 +176,7 @@ STAGE PLANS: rowBatchContext: dataColumnCount: 1 includeColumns: [0] - dataColumns: _col0:string + dataColumns: _col0:int partitionColumnCount: 0 scratchColumnTypeNames: [] Reduce Vectorization: @@ -303,15 +185,15 @@ STAGE PLANS: enableConditionsNotMet: hive.execution.engine mr IN [tez, spark] IS false Reduce Operator Tree: Select Operator - expressions: KEY.reducesinkkey0 (type: string) + expressions: KEY.reducesinkkey0 (type: int) outputColumnNames: _col0 - Statistics: Num rows: 250 Data size: 21750 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 5 Data size: 20 Basic stats: COMPLETE Column stats: COMPLETE Limit - Number of rows: 5 - Statistics: Num rows: 5 Data size: 435 Basic stats: COMPLETE Column stats: COMPLETE + Number of rows: 3 + Statistics: Num rows: 3 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 5 Data size: 435 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 3 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat @@ -319,176 +201,81 @@ STAGE PLANS: Stage: Stage-0 Fetch Operator - limit: 5 + limit: 3 Processor Tree: ListSink -PREHOOK: query: SELECT key FROM src GROUP BY key ORDER BY key LIMIT 5 +PREHOOK: query: SELECT cint1 FROM t_test GROUP BY cint1 ORDER BY cint1 LIMIT 3 PREHOOK: type: QUERY -PREHOOK: Input: default@src +PREHOOK: Input: default@t_test #### A masked pattern was here #### -POSTHOOK: query: SELECT key FROM src GROUP BY key ORDER BY key LIMIT 5 +POSTHOOK: query: SELECT cint1 FROM t_test GROUP BY cint1 ORDER BY cint1 LIMIT 3 POSTHOOK: type: QUERY -POSTHOOK: Input: default@src +POSTHOOK: Input: default@t_test #### A masked pattern was here #### -0 -10 -100 -103 -104 -PREHOOK: query: explain vectorization detail -SELECT src1.key, src2.value FROM src src1 JOIN src src2 ON (src1.key = src2.key) ORDER BY src1.key LIMIT 5 +4 +6 +7 +PREHOOK: query: SELECT cint1, cint2 FROM t_test GROUP BY cint1, cint2 ORDER BY cint1, cint2 LIMIT 3 PREHOOK: type: QUERY -PREHOOK: Input: default@src +PREHOOK: Input: default@t_test #### A masked pattern was here #### -POSTHOOK: query: explain vectorization detail -SELECT src1.key, src2.value FROM src src1 JOIN src src2 ON (src1.key = src2.key) ORDER BY src1.key LIMIT 5 +POSTHOOK: query: SELECT cint1, cint2 FROM t_test GROUP BY cint1, cint2 ORDER BY cint1, cint2 LIMIT 3 POSTHOOK: type: QUERY -POSTHOOK: Input: default@src +POSTHOOK: Input: default@t_test #### A masked pattern was here #### -PLAN VECTORIZATION: - enabled: true - enabledConditionsMet: [hive.vectorized.execution.enabled IS true] - -STAGE DEPENDENCIES: - Stage-1 is a root stage - Stage-2 depends on stages: Stage-1 - Stage-0 depends on stages: Stage-2 - -STAGE PLANS: - Stage: Stage-1 
- Map Reduce - Map Operator Tree: - TableScan - alias: src1 - filterExpr: key is not null (type: boolean) - Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE - Filter Operator - predicate: key is not null (type: boolean) - Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE - Select Operator - expressions: key (type: string) - outputColumnNames: _col0 - Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE - Reduce Output Operator - key expressions: _col0 (type: string) - sort order: + - Map-reduce partition columns: _col0 (type: string) - Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE - TableScan - alias: src2 - filterExpr: key is not null (type: boolean) - Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE - Filter Operator - predicate: key is not null (type: boolean) - Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE - Select Operator - expressions: key (type: string), value (type: string) - outputColumnNames: _col0, _col1 - Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE - Reduce Output Operator - key expressions: _col0 (type: string) - sort order: + - Map-reduce partition columns: _col0 (type: string) - Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE - value expressions: _col1 (type: string) - Map Vectorization: - enabled: false - enabledConditionsNotMet: Vectorized map work only works with 1 TableScanOperator IS false - Reduce Vectorization: - enabled: false - enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true - enableConditionsNotMet: hive.execution.engine mr IN [tez, spark] IS false - Reduce Operator Tree: - Join Operator - condition map: - Inner Join 0 to 1 - keys: - 0 _col0 (type: string) - 1 _col0 (type: string) - outputColumnNames: _col0, _col2 - Statistics: Num rows: 791 Data size: 140798 Basic stats: COMPLETE Column stats: COMPLETE - Select Operator - expressions: _col0 (type: string), _col2 (type: string) - outputColumnNames: _col0, _col1 - Statistics: Num rows: 791 Data size: 140798 Basic stats: COMPLETE Column stats: COMPLETE - File Output Operator - compressed: false - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe - - Stage: Stage-2 - Map Reduce - Map Operator Tree: - TableScan - TableScan Vectorization: - native: true - vectorizationSchemaColumns: [0:_col0:string, 1:_col1:string] - Reduce Output Operator - key expressions: _col0 (type: string) - sort order: + - Reduce Sink Vectorization: - className: VectorReduceSinkOperator - native: false - nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true - nativeConditionsNotMet: hive.execution.engine mr IN [tez, spark] IS false - Statistics: Num rows: 791 Data size: 140798 Basic stats: COMPLETE Column stats: COMPLETE - TopN Hash Memory Usage: 0.1 - value expressions: _col1 (type: string) - Execution mode: vectorized - Map Vectorization: - enabled: true - enabledConditionsMet: hive.vectorized.use.vector.serde.deserialize IS true - inputFormatFeatureSupport: [] - featureSupportInUse: [] - 
inputFileFormats: org.apache.hadoop.mapred.SequenceFileInputFormat - allNative: false - usesVectorUDFAdaptor: false - vectorized: true - rowBatchContext: - dataColumnCount: 2 - includeColumns: [0, 1] - dataColumns: _col0:string, _col1:string - partitionColumnCount: 0 - scratchColumnTypeNames: [] - Reduce Vectorization: - enabled: false - enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true - enableConditionsNotMet: hive.execution.engine mr IN [tez, spark] IS false - Reduce Operator Tree: - Select Operator - expressions: KEY.reducesinkkey0 (type: string), VALUE._col0 (type: string) - outputColumnNames: _col0, _col1 - Statistics: Num rows: 791 Data size: 140798 Basic stats: COMPLETE Column stats: COMPLETE - Limit - Number of rows: 5 - Statistics: Num rows: 5 Data size: 890 Basic stats: COMPLETE Column stats: COMPLETE - File Output Operator - compressed: false - Statistics: Num rows: 5 Data size: 890 Basic stats: COMPLETE Column stats: COMPLETE - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - - Stage: Stage-0 - Fetch Operator - limit: 5 - Processor Tree: - ListSink - -PREHOOK: query: SELECT src1.key, src2.value FROM src src1 JOIN src src2 ON (src1.key = src2.key) ORDER BY src1.key LIMIT 5 +4 1 +4 2 +6 2 +PREHOOK: query: SELECT cint1, cint2 FROM t_test GROUP BY cint1, cint2 ORDER BY cint1 DESC, cint2 LIMIT 3 +PREHOOK: type: QUERY +PREHOOK: Input: default@t_test +#### A masked pattern was here #### +POSTHOOK: query: SELECT cint1, cint2 FROM t_test GROUP BY cint1, cint2 ORDER BY cint1 DESC, cint2 LIMIT 3 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t_test +#### A masked pattern was here #### +8 9 +7 8 +6 2 +PREHOOK: query: SELECT cint1, cdouble FROM t_test GROUP BY cint1, cdouble ORDER BY cint1, cdouble LIMIT 3 +PREHOOK: type: QUERY +PREHOOK: Input: default@t_test +#### A masked pattern was here #### +POSTHOOK: query: SELECT cint1, cdouble FROM t_test GROUP BY cint1, cdouble ORDER BY cint1, cdouble LIMIT 3 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t_test +#### A masked pattern was here #### +4 2.0 +4 3.3 +6 1.8 +PREHOOK: query: SELECT cvarchar, cdouble FROM t_test GROUP BY cvarchar, cdouble ORDER BY cvarchar, cdouble LIMIT 3 +PREHOOK: type: QUERY +PREHOOK: Input: default@t_test +#### A masked pattern was here #### +POSTHOOK: query: SELECT cvarchar, cdouble FROM t_test GROUP BY cvarchar, cdouble ORDER BY cvarchar, cdouble LIMIT 3 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t_test +#### A masked pattern was here #### +five 2.0 +four 4.5 +one 2.0 +PREHOOK: query: SELECT cdecimal1, cdecimal2 FROM t_test GROUP BY cdecimal1, cdecimal2 ORDER BY cdecimal1, cdecimal2 LIMIT 3 PREHOOK: type: QUERY -PREHOOK: Input: default@src +PREHOOK: Input: default@t_test #### A masked pattern was here #### -POSTHOOK: query: SELECT src1.key, src2.value FROM src src1 JOIN src src2 ON (src1.key = src2.key) ORDER BY src1.key LIMIT 5 +POSTHOOK: query: SELECT cdecimal1, cdecimal2 FROM t_test GROUP BY cdecimal1, cdecimal2 ORDER BY cdecimal1, cdecimal2 LIMIT 3 POSTHOOK: type: QUERY -POSTHOOK: Input: default@src +POSTHOOK: Input: default@t_test #### A masked pattern was here #### -0 val_0 -0 val_0 -0 val_0 -0 val_0 -0 val_0 +1.80 1.80000 +2.00 2.00000 +3.30 3.30000 +PREHOOK: query: DROP TABLE t_test +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@t_test +PREHOOK: Output: default@t_test +POSTHOOK: query: DROP TABLE t_test +POSTHOOK: 
type: DROPTABLE
+POSTHOOK: Input: default@t_test
+POSTHOOK: Output: default@t_test
diff --git serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ObjectComparator.java serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ObjectComparator.java
new file mode 100644
index 0000000000..9fb7787118
--- /dev/null
+++ serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ObjectComparator.java
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.serde2.objectinspector;
+
+import java.util.Comparator;
+
+import static org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.NullValueOption;
+
+/**
+ * This class wraps the ObjectInspectorUtils.compare method and implements java.util.Comparator.
+ */
+public class ObjectComparator implements Comparator<Object> {
+
+  private final ObjectInspector objectInspector1;
+  private final ObjectInspector objectInspector2;
+  private final NullValueOption nullSortOrder;
+  private final MapEqualComparer mapEqualComparer = new FullMapEqualComparer();
+
+  public ObjectComparator(ObjectInspector objectInspector1, ObjectInspector objectInspector2,
+      NullValueOption nullSortOrder) {
+    this.objectInspector1 = objectInspector1;
+    this.objectInspector2 = objectInspector2;
+    this.nullSortOrder = nullSortOrder;
+  }
+
+  @Override
+  public int compare(Object o1, Object o2) {
+    return ObjectInspectorUtils.compare(o1, objectInspector1, o2, objectInspector2, mapEqualComparer, nullSortOrder);
+  }
+}
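+// Minimal usage sketch (illustrative only; the inspector choice and the Text
+// values below are placeholders, not part of any caller in this patch):
+//
+//   ObjectInspector oi = PrimitiveObjectInspectorFactory.writableStringObjectInspector;
+//   ObjectComparator comparator = new ObjectComparator(oi, oi, NullValueOption.MINVALUE);
+//   comparator.compare(null, new Text("a"));          // < 0: MINVALUE sorts nulls as the smallest value
+//   comparator.compare(new Text("a"), new Text("a")); // == 0 for equal keys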