diff --git common/src/java/org/apache/hadoop/hive/conf/HiveConf.java common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
index 711dfbdc1f..ac96e96c5d 100644
--- common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
+++ common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
@@ -1692,6 +1692,9 @@ private static void populateLlapDaemonVarsSet(Set<String> llapDaemonVarsSetLocal
         "If the skew information is correctly stored in the metadata, hive.optimize.skewjoin.compiletime\n" +
         "would change the query plan to take care of it, and hive.optimize.skewjoin will be a no-op."),
 
+    // TODO: Set the default value to false
+    HIVE_OPTIMIZE_TOPNKEY("hive.optimize.topnkey", true, "Whether to enable the top n key optimizer."),
+
     HIVE_SHARED_WORK_OPTIMIZATION("hive.optimize.shared.work", true,
         "Whether to enable shared work optimizer. The optimizer finds scan operator over the same table\n" +
         "and follow-up operators in the query plan and merges them if they meet some preconditions. Tez only."),
diff --git itests/src/test/resources/testconfiguration.properties itests/src/test/resources/testconfiguration.properties
index 2bf64dcfbd..241c7b81ec 100644
--- itests/src/test/resources/testconfiguration.properties
+++ itests/src/test/resources/testconfiguration.properties
@@ -281,6 +281,7 @@ minillaplocal.shared.query.files=alter_merge_2_orc.q,\
   tez_union_multiinsert.q,\
   tez_vector_dynpart_hashjoin_1.q,\
   tez_vector_dynpart_hashjoin_2.q,\
+  topn_key.q,\
   union2.q,\
   union3.q,\
   union4.q,\
@@ -385,6 +386,7 @@ minillaplocal.shared.query.files=alter_merge_2_orc.q,\
   vector_row__id.q,\
   vector_string_concat.q,\
   vector_struct_in.q,\
+  vector_topn_key.q,\
   vector_udf_character_length.q,\
   vector_udf_octet_length.q,\
   vector_varchar_4.q,\
diff --git ql/src/gen/thrift/gen-javabean/org/apache/hadoop/hive/ql/plan/api/OperatorType.java ql/src/gen/thrift/gen-javabean/org/apache/hadoop/hive/ql/plan/api/OperatorType.java
index a002348013..f8328be25c 100644
--- ql/src/gen/thrift/gen-javabean/org/apache/hadoop/hive/ql/plan/api/OperatorType.java
+++ ql/src/gen/thrift/gen-javabean/org/apache/hadoop/hive/ql/plan/api/OperatorType.java
@@ -37,7 +37,8 @@
   ORCFILEMERGE(22),
   RCFILEMERGE(23),
   MERGEJOIN(24),
-  SPARKPRUNINGSINK(25);
+  SPARKPRUNINGSINK(25),
+  TOPNKEY(26);
 
   private final int value;
 
@@ -110,6 +111,8 @@ public static OperatorType findByValue(int value) {
         return MERGEJOIN;
       case 25:
         return SPARKPRUNINGSINK;
+      case 26:
+        return TOPNKEY;
       default:
         return null;
     }
diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/KeyWrapperFactory.java ql/src/java/org/apache/hadoop/hive/ql/exec/KeyWrapperFactory.java
index 775c737bba..d233fb077f 100644
--- ql/src/java/org/apache/hadoop/hive/ql/exec/KeyWrapperFactory.java
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/KeyWrapperFactory.java
@@ -158,13 +158,13 @@ private void deepCopyElements(Object[] keys,
     }
   }
 
-  transient Object[] singleEleArray = new Object[1];
   transient StringObjectInspector soi_new, soi_copy;
 
   class TextKeyWrapper extends KeyWrapper {
     int hashcode;
     Object key;
     boolean isCopy;
+    transient Object[] singleEleArray = new Object[1];
 
     public TextKeyWrapper(boolean isCopy) {
       this(-1, null, isCopy);
diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/OperatorFactory.java ql/src/java/org/apache/hadoop/hive/ql/exec/OperatorFactory.java
index e665064133..4894e70cef 100644
--- ql/src/java/org/apache/hadoop/hive/ql/exec/OperatorFactory.java
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/OperatorFactory.java
@@ -38,6 +38,7 @@
 import org.apache.hadoop.hive.ql.exec.vector.VectorSelectOperator;
 import org.apache.hadoop.hive.ql.exec.vector.VectorSparkHashTableSinkOperator;
 import org.apache.hadoop.hive.ql.exec.vector.VectorSparkPartitionPruningSinkOperator;
+import org.apache.hadoop.hive.ql.exec.vector.VectorTopNKeyOperator;
 import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext;
 import org.apache.hadoop.hive.ql.exec.vector.reducesink.VectorReduceSinkCommonOperator;
 import org.apache.hadoop.hive.ql.exec.vector.ptf.VectorPTFOperator;
@@ -76,6 +77,7 @@
 import org.apache.hadoop.hive.ql.plan.SelectDesc;
 import org.apache.hadoop.hive.ql.plan.SparkHashTableSinkDesc;
 import org.apache.hadoop.hive.ql.plan.TableScanDesc;
+import org.apache.hadoop.hive.ql.plan.TopNKeyDesc;
 import org.apache.hadoop.hive.ql.plan.UDTFDesc;
 import org.apache.hadoop.hive.ql.plan.UnionDesc;
 import org.apache.hadoop.hive.ql.plan.VectorDesc;
@@ -126,6 +128,7 @@
     opvec.put(OrcFileMergeDesc.class, OrcFileMergeOperator.class);
     opvec.put(CommonMergeJoinDesc.class, CommonMergeJoinOperator.class);
     opvec.put(ListSinkDesc.class, ListSinkOperator.class);
+    opvec.put(TopNKeyDesc.class, TopNKeyOperator.class);
   }
 
   static {
@@ -143,6 +146,7 @@
     vectorOpvec.put(LimitDesc.class, VectorLimitOperator.class);
     vectorOpvec.put(PTFDesc.class, VectorPTFOperator.class);
     vectorOpvec.put(SparkHashTableSinkDesc.class, VectorSparkHashTableSinkOperator.class);
+    vectorOpvec.put(TopNKeyDesc.class, VectorTopNKeyOperator.class);
   }
 
   public static <T extends OperatorDesc> Operator<T> getVectorOperator(
diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/TopNKeyOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/TopNKeyOperator.java
new file mode 100644
index 0000000000..b55acaf731
--- /dev/null
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/TopNKeyOperator.java
@@ -0,0 +1,177 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hive.ql.CompilationOpContext;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
+import org.apache.hadoop.hive.ql.plan.TopNKeyDesc;
+import org.apache.hadoop.hive.ql.plan.api.OperatorType;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
+
+import java.io.Serializable;
+import java.util.Comparator;
+import java.util.PriorityQueue;
+
+import static org.apache.hadoop.hive.ql.plan.api.OperatorType.TOPNKEY;
+
+/**
+ * TopNKeyOperator forwards only the rows whose keys are among the top n keys
+ * seen so far, according to the sort order of the downstream reduce sink.
+ */
+public class TopNKeyOperator extends Operator<TopNKeyDesc> implements Serializable {
+
+  private static final long serialVersionUID = 1L;
+
+  private transient KeyWrapper keyWrapper;
+  private transient KeyWrapper standardKeyWrapper;
+  private transient int topN;
+  private transient PriorityQueue<KeyWrapper> priorityQueue;
+
+  private transient int rowLimit;
+  private transient int rowSize;
+  private transient Object[] rows;
+  private transient ObjectInspector standardObjectInspector;
+
+  public TopNKeyOperator() {
+    super();
+  }
+
+  public TopNKeyOperator(CompilationOpContext ctx) {
+    super(ctx);
+  }
+
+  public static class KeyWrapperComparator implements Comparator<KeyWrapper> {
+    private ObjectInspector[] objectInspectors1;
+    private ObjectInspector[] objectInspectors2;
+    private boolean[] columnSortOrderIsDesc;
+
+    public KeyWrapperComparator(ObjectInspector[] objectInspectors1,
+        ObjectInspector[] objectInspectors2, boolean[] columnSortOrderIsDesc) {
+      this.objectInspectors1 = objectInspectors1;
+      this.objectInspectors2 = objectInspectors2;
+      this.columnSortOrderIsDesc = columnSortOrderIsDesc;
+    }
+
+    @Override
+    public int compare(KeyWrapper key1, KeyWrapper key2) {
+      return ObjectInspectorUtils.compare(key1.getKeyArray(), objectInspectors1,
+          key2.getKeyArray(), objectInspectors2, columnSortOrderIsDesc);
+    }
+  }
+
+  @Override
+  protected void initializeOp(Configuration hconf) throws HiveException {
+    super.initializeOp(hconf);
+
+    this.topN = conf.getTopN();
+
+    String columnSortOrder = conf.getColumnSortOrder();
+    boolean[] columnSortOrderIsDesc = new boolean[columnSortOrder.length()];
+    for (int i = 0; i < columnSortOrderIsDesc.length; i++) {
+      columnSortOrderIsDesc[i] = (columnSortOrder.charAt(i) == '-');
+    }
+
+    ObjectInspector rowInspector = inputObjInspectors[0];
+    standardObjectInspector = ObjectInspectorUtils.getStandardObjectInspector(rowInspector);
+    outputObjInspector = standardObjectInspector;
+
+    // init keyFields
+    int numKeys = conf.getKeys().size();
+    ExprNodeEvaluator[] keyFields = new ExprNodeEvaluator[numKeys];
+    ObjectInspector[] keyObjectInspectors = new ObjectInspector[numKeys];
+    ExprNodeEvaluator[] standardKeyFields = new ExprNodeEvaluator[numKeys];
+    ObjectInspector[] standardKeyObjectInspectors = new ObjectInspector[numKeys];
+
+    for (int i = 0; i < numKeys; i++) {
+      ExprNodeDesc key = conf.getKeys().get(i);
+      keyFields[i] = ExprNodeEvaluatorFactory.get(key, hconf);
+      keyObjectInspectors[i] = keyFields[i].initialize(rowInspector);
+      standardKeyFields[i] = ExprNodeEvaluatorFactory.get(key, hconf);
+      standardKeyObjectInspectors[i] = standardKeyFields[i].initialize(standardObjectInspector);
+    }
+
+    priorityQueue = new PriorityQueue<>(topN + 1, new TopNKeyOperator.KeyWrapperComparator(
+        standardKeyObjectInspectors, standardKeyObjectInspectors, columnSortOrderIsDesc));
+
+    keyWrapper = new KeyWrapperFactory(keyFields, keyObjectInspectors,
+        standardKeyObjectInspectors).getKeyWrapper();
+    standardKeyWrapper = new KeyWrapperFactory(standardKeyFields, standardKeyObjectInspectors,
+        standardKeyObjectInspectors).getKeyWrapper();
+
+    rowLimit = VectorizedRowBatch.DEFAULT_SIZE;
+    rows = new Object[rowLimit];
+    rowSize = 0;
+  }
+
+  @Override
+  public void process(Object row, int tag) throws HiveException {
+    keyWrapper.getNewKey(row, inputObjInspectors[0]);
+    keyWrapper.setHashKey();
+
+    if (!priorityQueue.contains(keyWrapper)) {
+      priorityQueue.offer(keyWrapper.copyKey());
+    }
+    // Keep at most topN keys: evict the worst key once the queue overflows.
+    if (priorityQueue.size() > topN) {
+      priorityQueue.poll();
+    }
+
+    rows[rowSize] = ObjectInspectorUtils.copyToStandardObject(row, inputObjInspectors[0]);
+    rowSize++;
+
+    // Flush the buffered rows whenever the buffer is full.
+    if (rowSize % rowLimit == 0) {
+      processRows();
+    }
+  }
+
+  private void processRows() throws HiveException {
+    for (int i = 0; i < rowSize; i++) {
+      Object row = rows[i];
+
+      standardKeyWrapper.getNewKey(row, standardObjectInspector);
+      standardKeyWrapper.setHashKey();
+
+      // Forward only the rows whose key survived in the queue.
+      if (priorityQueue.contains(standardKeyWrapper)) {
+        forward(row, standardObjectInspector);
+      }
+    }
+    priorityQueue.clear();
+    rowSize = 0;
+  }
+
+  @Override
+  protected final void closeOp(boolean abort) throws HiveException {
+    processRows();
+    super.closeOp(abort);
+  }
+
+  @Override
+  public String getName() {
+    return getOperatorName();
+  }
+
+  public static String getOperatorName() {
+    return "TNK";
+  }
+
+  @Override
+  public OperatorType getType() {
+    return TOPNKEY;
+  }
+}
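Aside (not part of the patch): TopNKeyOperator tracks the current top n keys with a bounded PriorityQueue whose comparator is derived from the reduce sink's sort order, so a single poll() evicts the worst key once the queue exceeds topN. A minimal, self-contained sketch of the same bookkeeping, using plain String keys and an ascending order instead of Hive's KeyWrapper and ObjectInspectorUtils.compare (all names here are hypothetical):

    import java.util.Comparator;
    import java.util.PriorityQueue;

    final class TopNKeySketch {
      private final int topN;
      // Max-heap: the head is the worst (largest) key kept, so it can be evicted cheaply.
      private final PriorityQueue<String> worstFirst;

      TopNKeySketch(int topN) {
        this.topN = topN;
        this.worstFirst = new PriorityQueue<>(topN + 1, Comparator.<String>reverseOrder());
      }

      // Returns true if key is currently among the topN smallest keys seen so far.
      boolean offer(String key) {
        if (!worstFirst.contains(key)) {
          worstFirst.offer(key);
          if (worstFirst.size() > topN) {
            worstFirst.poll(); // drop the current worst key
          }
        }
        return worstFirst.contains(key);
      }

      public static void main(String[] args) {
        TopNKeySketch sketch = new TopNKeySketch(2);
        for (String k : new String[] {"c", "a", "d", "b", "a"}) {
          System.out.println(k + " kept=" + sketch.offer(k));
        }
      }
    }

Note that contains() is a linear scan of the queue; that is tolerable only because the queue never holds more than topN + 1 entries.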
diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorTopNKeyOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorTopNKeyOperator.java
new file mode 100644
index 0000000000..d7c479b19c
--- /dev/null
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorTopNKeyOperator.java
@@ -0,0 +1,248 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.exec.vector;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Joiner;
+import com.google.common.primitives.Ints;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hive.common.type.DataTypePhysicalVariation;
+import org.apache.hadoop.hive.ql.CompilationOpContext;
+import org.apache.hadoop.hive.ql.exec.Operator;
+import org.apache.hadoop.hive.ql.exec.TopNKeyOperator;
+import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.plan.OperatorDesc;
+import org.apache.hadoop.hive.ql.plan.TopNKeyDesc;
+import org.apache.hadoop.hive.ql.plan.VectorDesc;
+import org.apache.hadoop.hive.ql.plan.VectorTopNKeyDesc;
+import org.apache.hadoop.hive.ql.plan.api.OperatorType;
+import org.apache.hadoop.hive.serde.serdeConstants;
+import org.apache.hadoop.hive.serde2.SerDeException;
+import org.apache.hadoop.hive.serde2.binarysortable.BinarySortableSerDe;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.WritableUtils;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.PriorityQueue;
+import java.util.Properties;
+
+import static org.apache.hadoop.hive.ql.plan.api.OperatorType.TOPNKEY;
+
+/**
+ * Vectorized version of TopNKeyOperator. Keys are serialized with
+ * BinarySortableSerDe, so the priority queue can compare them byte-wise.
+ */
+public class VectorTopNKeyOperator extends Operator<TopNKeyDesc>
+    implements VectorizationOperator, VectorizationContextRegion {
+
+  private static final long serialVersionUID = 1L;
+
+  private VectorTopNKeyDesc vectorTopNKeyDesc;
+  private VectorizationContext vContext;
+  private VectorizationContext vOutContext;
+  private int[] keyCols;
+  private TypeInfo[] keyTypeInfos;
+
+  private transient boolean firstBatch;
+  private transient Object[] singleRow;
+  private transient VectorExtractRow vectorExtractRow;
+  private transient PriorityQueue<Writable> priorityQueue;
+  private transient BinarySortableSerDe binarySortableSerDe;
+  private transient StructObjectInspector keyObjectInspector;
+
+  public VectorTopNKeyOperator(CompilationOpContext ctx, OperatorDesc conf,
+      VectorizationContext vContext, VectorDesc vectorDesc) {
+    this(ctx);
+    this.conf = (TopNKeyDesc) conf;
+    this.vContext = vContext;
+    this.vectorTopNKeyDesc = (VectorTopNKeyDesc) vectorDesc;
+
+    VectorExpression[] keyExpressions = vectorTopNKeyDesc.getKeyExpressions();
+    final int size = keyExpressions.length;
+
+    this.keyCols = new int[size];
+    this.keyTypeInfos = new TypeInfo[size];
+    DataTypePhysicalVariation[] keyDataTypePhysicalVariations = new DataTypePhysicalVariation[size];
+
+    for (int i = 0; i < size; i++) {
+      VectorExpression keyExpression = keyExpressions[i];
+      this.keyCols[i] = keyExpression.getOutputColumnNum();
+      this.keyTypeInfos[i] = keyExpression.getOutputTypeInfo();
+      keyDataTypePhysicalVariations[i] = keyExpression.getOutputDataTypePhysicalVariation();
+    }
+
+    vOutContext = new VectorizationContext(getName(), ((TopNKeyDesc) conf).getKeyColumnNames(),
+        /* vContextEnvironment */ vContext);
+    vOutContext.setInitialTypeInfos(Arrays.asList(keyTypeInfos));
+    vOutContext.setInitialDataTypePhysicalVariations(Arrays.asList(keyDataTypePhysicalVariations));
+  }
+
+  /** Kryo ctor. */
+  @VisibleForTesting
+  public VectorTopNKeyOperator() {
+    super();
+  }
+
+  public VectorTopNKeyOperator(CompilationOpContext ctx) {
+    super(ctx);
+  }
+
+  @Override
+  protected void initializeOp(Configuration hconf) throws HiveException {
+    super.initializeOp(hconf);
+
+    this.firstBatch = true;
+
+    VectorExpression[] keyExpressions = vectorTopNKeyDesc.getKeyExpressions();
+    final int size = keyExpressions.length;
+    ObjectInspector[] fieldObjectInspectors = new ObjectInspector[size];
+
+    for (int i = 0; i < size; i++) {
+      VectorExpression keyExpression = keyExpressions[i];
+      fieldObjectInspectors[i] = TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo(
+          keyExpression.getOutputTypeInfo());
+    }
+
+    keyObjectInspector = ObjectInspectorFactory.getStandardStructObjectInspector(
+        this.conf.getKeyColumnNames(), Arrays.asList(fieldObjectInspectors));
+  }
+
+  @Override
+  public void process(Object data, int tag) throws HiveException {
+    VectorizedRowBatch batch = (VectorizedRowBatch) data;
+
+    if (firstBatch) {
+      vectorExtractRow = new VectorExtractRow();
+      vectorExtractRow.init(keyObjectInspector, Ints.asList(keyCols));
+
+      singleRow = new Object[vectorExtractRow.getCount()];
+
+      priorityQueue = new PriorityQueue<>();
+
+      try {
+        binarySortableSerDe = new BinarySortableSerDe();
+        Properties properties = new Properties();
+        properties.setProperty(serdeConstants.LIST_COLUMNS,
+            Joiner.on(',').join(conf.getKeyColumnNames()));
+        properties.setProperty(serdeConstants.LIST_COLUMN_TYPES,
+            Joiner.on(',').join(keyTypeInfos));
+        properties.setProperty(serdeConstants.SERIALIZATION_SORT_ORDER,
+            conf.getColumnSortOrder());
+        binarySortableSerDe.initialize(getConfiguration(), properties);
+      } catch (SerDeException e) {
+        throw new HiveException(e);
+      }
+
+      firstBatch = false;
+    }
+
+    // Clear the priority queue
+    priorityQueue.clear();
+
+    // Get top n keys
+    for (int i = 0; i < batch.size; i++) {
+
+      // Get keys
+      if (batch.selectedInUse) {
+        vectorExtractRow.extractRow(batch, batch.selected[i], singleRow);
+      } else {
+        vectorExtractRow.extractRow(batch, i, singleRow);
+      }
+
+      Writable keysWritable;
+      try {
+        keysWritable = binarySortableSerDe.serialize(singleRow, keyObjectInspector);
+      } catch (SerDeException e) {
+        throw new HiveException(e);
+      }
+
+      // Put the copied keys into the priority queue
+      if (!priorityQueue.contains(keysWritable)) {
+        priorityQueue.offer(WritableUtils.clone(keysWritable, getConfiguration()));
+      }
+
+      // Limit the queue size
+      if (priorityQueue.size() > conf.getTopN()) {
+        priorityQueue.poll();
+      }
+    }
+
+    // Filter rows with top n keys
+    int size = 0;
+    int[] selected = new int[batch.selected.length];
+    for (int i = 0; i < batch.size; i++) {
+
+      // Get keys
+      if (batch.selectedInUse) {
+        vectorExtractRow.extractRow(batch, batch.selected[i], singleRow);
+      } else {
+        vectorExtractRow.extractRow(batch, i, singleRow);
+      }
+
+      Writable keysWritable;
+      try {
+        keysWritable = binarySortableSerDe.serialize(singleRow, keyObjectInspector);
+      } catch (SerDeException e) {
+        throw new HiveException(e);
+      }
+
+      // Select a row in the priority queue
+      if (priorityQueue.contains(keysWritable)) {
+        selected[size++] = i;
+      }
+    }
+
+    // Apply selection to batch
+    if (batch.size != size) {
+      batch.selectedInUse = true;
+      batch.selected = selected;
+      batch.size = size;
+    }
+
+    // Forward the result
+    forward(batch, null, true);
+  }
+
+  @Override
+  public String getName() {
+    return TopNKeyOperator.getOperatorName();
+  }
+
+  @Override
+  public OperatorType getType() {
+    return TOPNKEY;
+  }
+
+  @Override
+  public VectorizationContext getInputVectorizationContext() {
+    return vContext;
+  }
+
+  @Override
+  public VectorizationContext getOutputVectorizationContext() {
+    return vOutContext;
+  }
+
+  @Override
+  public VectorDesc getVectorDesc() {
+    return vectorTopNKeyDesc;
+  }
+}
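Aside (not part of the patch): the vectorized operator can keep serialized keys in an unadorned PriorityQueue because BinarySortableSerDe produces byte sequences whose lexicographic order matches the declared sort order, and BytesWritable compares byte-lexicographically. A small, self-contained check of that property (hypothetical snippet, not from the patch):

    import java.util.Arrays;
    import java.util.Properties;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.hive.serde.serdeConstants;
    import org.apache.hadoop.hive.serde2.binarysortable.BinarySortableSerDe;
    import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
    import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
    import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
    import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
    import org.apache.hadoop.io.BytesWritable;
    import org.apache.hadoop.io.WritableUtils;

    public class BinarySortableOrderCheck {
      public static void main(String[] args) throws Exception {
        StructObjectInspector oi = ObjectInspectorFactory.getStandardStructObjectInspector(
            Arrays.asList("key"),
            Arrays.<ObjectInspector>asList(PrimitiveObjectInspectorFactory.javaStringObjectInspector));

        BinarySortableSerDe serde = new BinarySortableSerDe();
        Properties props = new Properties();
        props.setProperty(serdeConstants.LIST_COLUMNS, "key");
        props.setProperty(serdeConstants.LIST_COLUMN_TYPES, "string");
        props.setProperty(serdeConstants.SERIALIZATION_SORT_ORDER, "+");
        Configuration conf = new Configuration();
        serde.initialize(conf, props);

        // serialize() reuses an internal buffer, so clone before keeping a result,
        // just as the operator above clones before offering to the queue.
        BytesWritable a = WritableUtils.clone(
            (BytesWritable) serde.serialize(new Object[] {"abc"}, oi), conf);
        BytesWritable b = WritableUtils.clone(
            (BytesWritable) serde.serialize(new Object[] {"abd"}, oi), conf);

        // Negative: byte order agrees with the ascending ("+") sort order.
        System.out.println(a.compareTo(b));
      }
    }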
diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/TopNKeyProcessor.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/TopNKeyProcessor.java
new file mode 100644
index 0000000000..4698df6cd4
--- /dev/null
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/TopNKeyProcessor.java
@@ -0,0 +1,86 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.optimizer;
+
+import org.apache.hadoop.hive.ql.exec.Operator;
+import org.apache.hadoop.hive.ql.exec.OperatorFactory;
+import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
+import org.apache.hadoop.hive.ql.exec.RowSchema;
+import org.apache.hadoop.hive.ql.lib.Node;
+import org.apache.hadoop.hive.ql.lib.NodeProcessor;
+import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
+import org.apache.hadoop.hive.ql.plan.OperatorDesc;
+import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc;
+import org.apache.hadoop.hive.ql.plan.TopNKeyDesc;
+
+import java.util.List;
+import java.util.Stack;
+
+/**
+ * Inserts a TopNKeyOperator in front of every qualifying (ordered, limited)
+ * ReduceSinkOperator.
+ */
+public class TopNKeyProcessor implements NodeProcessor {
+
+  public TopNKeyProcessor() {
+  }
+
+  @Override
+  public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
+      Object... nodeOutputs) throws SemanticException {
+
+    // Get the reduce sink
+    ReduceSinkOperator reduceSinkOperator = (ReduceSinkOperator) nd;
+    ReduceSinkDesc reduceSinkDesc = reduceSinkOperator.getConf();
+
+    // Only an ordered, limited (top n) reduce sink qualifies
+    if (!reduceSinkDesc.isOrdering() || reduceSinkDesc.getTopN() < 0) {
+      return null;
+    }
+
+    List<ExprNodeDesc> keyCols = reduceSinkDesc.getKeyCols();
+    RowSchema rowSchema = reduceSinkOperator.getSchema();
+    String order = reduceSinkDesc.getOrder();
+
+    int depth = stack.size() - 1;
+
+    Operator<? extends OperatorDesc> parentOperator =
+        (Operator<? extends OperatorDesc>) stack.get(depth - 1);
+    Operator<? extends OperatorDesc> currentOperator =
+        (Operator<? extends OperatorDesc>) stack.get(depth);
+
+    // Insert top n key operator between the current operator and the parent operator
+    TopNKeyDesc topNKeyDesc = new TopNKeyDesc(reduceSinkDesc.getTopN(), keyCols, order);
+    makeOpBetween(topNKeyDesc, new RowSchema(rowSchema), parentOperator, currentOperator);
+
+    return null;
+  }
+
+  private static void makeOpBetween(OperatorDesc newOperatorDesc, RowSchema rowSchema,
+      Operator<? extends OperatorDesc> parentOperator,
+      Operator<? extends OperatorDesc> currentOperator) {
+
+    // PARENT -> (NEW, (CURRENT -> CHILD))
+    Operator<? extends OperatorDesc> newOperator = OperatorFactory.getAndMakeChild(
+        currentOperator.getCompilationOpContext(), newOperatorDesc, rowSchema,
+        currentOperator.getParentOperators());
+
+    // PARENT -> NEW -> CURRENT -> CHILD
+    newOperator.getChildOperators().add(currentOperator);
+    currentOperator.getParentOperators().add(newOperator);
+    parentOperator.removeChild(currentOperator);
+  }
+}
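Aside (not part of the patch): makeOpBetween splices a node into the operator DAG by updating the doubly linked parent/child lists on both endpoints. The same splice, reduced to a toy node type (hypothetical names):

    import java.util.ArrayList;
    import java.util.List;

    // Toy stand-in for Hive's Operator parent/child bookkeeping.
    final class DagNode {
      final String name;
      final List<DagNode> parents = new ArrayList<>();
      final List<DagNode> children = new ArrayList<>();
      DagNode(String name) { this.name = name; }
    }

    final class SpliceDemo {
      // Before: parent -> current; after: parent -> inserted -> current
      static void insertBetween(DagNode parent, DagNode inserted, DagNode current) {
        parent.children.set(parent.children.indexOf(current), inserted);
        inserted.parents.add(parent);
        inserted.children.add(current);
        current.parents.set(current.parents.indexOf(parent), inserted);
      }

      public static void main(String[] args) {
        DagNode parent = new DagNode("parent");
        DagNode current = new DagNode("RS");
        parent.children.add(current);
        current.parents.add(parent);
        insertBetween(parent, new DagNode("TNK"), current);
        System.out.println(parent.children.get(0).name); // TNK
        System.out.println(current.parents.get(0).name); // TNK
      }
    }

Both ends must be rewired; forgetting either list leaves the walker and the rewired plan disagreeing about the DAG shape.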
diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/TopNKeyPushdown.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/TopNKeyPushdown.java
new file mode 100644
index 0000000000..d452fb7b91
--- /dev/null
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/TopNKeyPushdown.java
@@ -0,0 +1,239 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.optimizer;
+
+import org.apache.hadoop.hive.ql.exec.CommonJoinOperator;
+import org.apache.hadoop.hive.ql.exec.FilterOperator;
+import org.apache.hadoop.hive.ql.exec.GroupByOperator;
+import org.apache.hadoop.hive.ql.exec.Operator;
+import org.apache.hadoop.hive.ql.exec.OperatorFactory;
+import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
+import org.apache.hadoop.hive.ql.exec.RowSchema;
+import org.apache.hadoop.hive.ql.exec.SelectOperator;
+import org.apache.hadoop.hive.ql.exec.TopNKeyOperator;
+import org.apache.hadoop.hive.ql.lib.Node;
+import org.apache.hadoop.hive.ql.lib.NodeProcessor;
+import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
+import org.apache.hadoop.hive.ql.plan.GroupByDesc;
+import org.apache.hadoop.hive.ql.plan.JoinDesc;
+import org.apache.hadoop.hive.ql.plan.OperatorDesc;
+import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc;
+import org.apache.hadoop.hive.ql.plan.SelectDesc;
+import org.apache.hadoop.hive.ql.plan.TopNKeyDesc;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.Stack;
+
+import static org.apache.hadoop.hive.ql.plan.ExprNodeDesc.ExprNodeDescEqualityWrapper.transform;
+
+/**
+ * Pushes a TopNKeyOperator down through group bys, selects, filters and joins,
+ * as long as the top n keys can still be computed below the parent operator.
+ */
+public class TopNKeyPushdown implements NodeProcessor {
+
+  private static final Logger LOG = LoggerFactory.getLogger(TopNKeyPushdown.class);
+
+  private TopNKeyOperator topNKeyOperator;
+  private TopNKeyDesc topNKeyDesc;
+
+  public TopNKeyPushdown() {
+  }
+
+  @Override
+  public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
+      Object... nodeOutputs) throws SemanticException {
+
+    topNKeyOperator = (TopNKeyOperator) nd;
+    topNKeyDesc = topNKeyOperator.getConf();
+
+    int depth = stack.size() - 1;
+    int delta;
+    while ((delta = checkAndPushdown(stack, depth)) > 0) {
+      depth -= delta;
+    }
+
+    return null;
+  }
+
+  private static List<ExprNodeDesc> mapKeyCols(Map<String, ExprNodeDesc> mapping,
+      List<ExprNodeDesc> keyCols) {
+    List<ExprNodeDesc> keyColsInOperator = new ArrayList<>();
+    for (ExprNodeDesc keyCol : keyCols) {
+      String keyExprString = keyCol.getExprString();
+      if (mapping.containsKey(keyExprString)) {
+        keyColsInOperator.add(mapping.get(keyExprString));
+      }
+    }
+    return keyColsInOperator;
+  }
+
+  private int checkAndPushdown(Stack<Node> stack, int depth) {
+
+    Operator<? extends OperatorDesc> parentOperator =
+        (Operator<? extends OperatorDesc>) stack.get(depth - 1);
+
+    LOG.debug("Check " + parentOperator);
+
+    if (parentOperator instanceof GroupByOperator) {
+      GroupByOperator groupByOperator = (GroupByOperator) parentOperator;
+      GroupByDesc groupByDesc = groupByOperator.getConf();
+
+      List<ExprNodeDesc> mappedKeyCols = mapKeyCols(groupByDesc.getColumnExprMap(),
+          topNKeyDesc.getKeys());
+      if (!transform(groupByDesc.getKeys()).containsAll(transform(mappedKeyCols))) {
+        return 0;
+      }
+
+      topNKeyDesc.setKeyColumns(mappedKeyCols);
+      topNKeyDesc.setColumnSortOrder(mapOrder(topNKeyDesc.getColumnSortOrder(),
+          groupByDesc.getKeys(), mappedKeyCols));
+      topNKeyOperator.setSchema(new RowSchema(groupByOperator.getSchema()));
+
+      moveDown(parentOperator);
+      return 1;
+    } else if (parentOperator instanceof SelectOperator) {
+      SelectOperator selectOperator = (SelectOperator) parentOperator;
+      SelectDesc selectDesc = selectOperator.getConf();
+
+      List<ExprNodeDesc> mappedKeyCols = mapKeyCols(selectDesc.getColumnExprMap(),
+          topNKeyDesc.getKeys());
+      if (!transform(selectDesc.getColList()).containsAll(transform(mappedKeyCols))) {
+        return 0;
+      }
+
+      topNKeyDesc.setKeyColumns(mappedKeyCols);
+      topNKeyDesc.setColumnSortOrder(mapOrder(topNKeyDesc.getColumnSortOrder(),
+          selectDesc.getColList(), mappedKeyCols));
+      topNKeyOperator.setSchema(new RowSchema(selectOperator.getSchema()));
+
+      moveDown(parentOperator);
+      return 1;
+    } else if (parentOperator instanceof FilterOperator) {
+      moveDown(parentOperator);
+      return 1;
+    } else if (parentOperator instanceof CommonJoinOperator) {
+      ReduceSinkOperator reduceSinkOperator = (ReduceSinkOperator) stack.get(depth - 2);
+      ReduceSinkDesc reduceSinkDesc = reduceSinkOperator.getConf();
+
+      CommonJoinOperator joinOperator = (CommonJoinOperator) parentOperator;
+      JoinDesc joinDesc = (JoinDesc) joinOperator.getConf();
+
+      List<ExprNodeDesc> mappedKeyCols = mapKeyCols(reduceSinkDesc.getColumnExprMap(),
+          mapKeyCols(joinDesc.getColumnExprMap(), topNKeyDesc.getKeys()));
+
+      // If there are no mapped key cols, skip
+      if (mappedKeyCols.isEmpty()) {
+        return 0;
+      }
+
+      TopNKeyDesc newTopNKeyDesc = new TopNKeyDesc();
+      newTopNKeyDesc.setColumnSortOrder(mapOrder(topNKeyDesc.getColumnSortOrder(),
+          reduceSinkDesc.getKeyCols(), mappedKeyCols));
+      newTopNKeyDesc.setKeyColumns(mappedKeyCols);
+      newTopNKeyDesc.setTopN(topNKeyDesc.getTopN());
+
+      // Avoid duplicates
+      Operator<? extends OperatorDesc> grandParentOperator =
+          (Operator<? extends OperatorDesc>) stack.get(depth - 3);
+      if (grandParentOperator instanceof TopNKeyOperator) {
+        TopNKeyOperator grandParentTnkOp = (TopNKeyOperator) grandParentOperator;
+        TopNKeyDesc grandParentTnkDesc = grandParentTnkOp.getConf();
+        if (grandParentTnkDesc.isSame(newTopNKeyDesc)) {
+          return 0;
+        }
+      }
+
+      topNKeyOperator = copyDown(newTopNKeyDesc, reduceSinkOperator);
+      topNKeyDesc = topNKeyOperator.getConf();
+      return 2;
+    }
+    return 0;
+  }
+
+  private void moveDown(Operator<? extends OperatorDesc> baseOperator) {
+    LOG.debug("Move down through " + baseOperator);
+
+    // Before: PARENT ---> BASE ---> NEW ---> CHILD
+    // After:  PARENT -1-> NEW -2-> BASE -3-> CHILD
+
+    List<Operator<? extends OperatorDesc>> parentOperators = baseOperator.getParentOperators();
+    List<Operator<? extends OperatorDesc>> childOperators = topNKeyOperator.getChildOperators();
+
+    // PARENT -1-> TNK
+    for (Operator<? extends OperatorDesc> parentOperator : parentOperators) {
+      parentOperator.replaceChild(baseOperator, topNKeyOperator);
+    }
+    topNKeyOperator.setParentOperators(new ArrayList<>(parentOperators));
+
+    // TNK -2-> BASE
+    topNKeyOperator.setChildOperators(new ArrayList<>(Collections.singletonList(baseOperator)));
+    baseOperator.setParentOperators(new ArrayList<>(Collections.singletonList(topNKeyOperator)));
+
+    // BASE -3-> CHILD
+    baseOperator.setChildOperators(new ArrayList<>(childOperators));
+    for (Operator<? extends OperatorDesc> childOperator : childOperators) {
+      childOperator.replaceParent(topNKeyOperator, baseOperator);
+    }
+  }
+
+  private TopNKeyOperator copyDown(TopNKeyDesc newDesc,
+      Operator<? extends OperatorDesc> baseOperator) {
+
+    List<Operator<? extends OperatorDesc>> parentOperators = baseOperator.getParentOperators();
+
+    LOG.debug("New top n key operator between " + baseOperator + " and " + parentOperators);
+
+    // Before: PARENT ------------> BASE
+    // After:  PARENT -1-> NEW -2-> BASE
+
+    Operator<? extends OperatorDesc> newOperator =
+        OperatorFactory.get(baseOperator.getCompilationOpContext(), newDesc,
+            baseOperator.getSchema());
+
+    // PARENT -1-> NEW
+    for (Operator<? extends OperatorDesc> parentOperator : parentOperators) {
+      parentOperator.replaceChild(baseOperator, newOperator);
+    }
+    newOperator.setParentOperators(new ArrayList<>(parentOperators));
+
+    // NEW -2-> BASE
+    newOperator.setChildOperators(new ArrayList<>(Collections.singletonList(baseOperator)));
+    baseOperator.setParentOperators(new ArrayList<>(Collections.singletonList(newOperator)));
+
+    return (TopNKeyOperator) newOperator;
+  }
+
+  private static String mapOrder(String order, List<ExprNodeDesc> parentCols,
+      List<ExprNodeDesc> mappedCols) {
+
+    StringBuilder builder = new StringBuilder();
+    int index = 0;
+    for (ExprNodeDesc mappedCol : mappedCols) {
+      if (parentCols.contains(mappedCol)) {
+        builder.append(order.charAt(index++));
+      } else {
+        builder.append("+");
+      }
+    }
+    return builder.toString();
+  }
+}
diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
index 8ce2c33d2e..0daffc1302 100644
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
@@ -41,6 +41,7 @@
 import org.apache.commons.lang.ArrayUtils;
 import org.apache.calcite.util.Pair;
 import org.apache.commons.lang3.tuple.ImmutablePair;
+import org.apache.hadoop.hive.ql.plan.TopNKeyDesc;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.apache.hadoop.fs.Path;
@@ -149,6 +150,7 @@
 import org.apache.hadoop.hive.ql.plan.VectorGroupByDesc.ProcessingMode;
 import org.apache.hadoop.hive.ql.plan.VectorSparkHashTableSinkDesc;
 import org.apache.hadoop.hive.ql.plan.VectorSparkPartitionPruningSinkDesc;
+import org.apache.hadoop.hive.ql.plan.VectorTopNKeyDesc;
 import org.apache.hadoop.hive.ql.plan.VectorLimitDesc;
 import org.apache.hadoop.hive.ql.plan.VectorMapJoinInfo;
 import org.apache.hadoop.hive.ql.plan.VectorSMBJoinDesc;
@@ -4034,6 +4036,20 @@ private boolean usesVectorUDFAdaptor(VectorExpression[] vecExprs) {
       vContext, vectorSelectDesc);
   }
 
+  private static Operator<? extends OperatorDesc> vectorizeTopNKeyOperator(
+      Operator<? extends OperatorDesc> topNKeyOperator, VectorizationContext vContext,
+      VectorTopNKeyDesc vectorTopNKeyDesc) throws HiveException {
+
+    TopNKeyDesc topNKeyDesc = (TopNKeyDesc) topNKeyOperator.getConf();
+
+    vectorTopNKeyDesc.setKeyExpressions(vContext.getVectorExpressions(
+        topNKeyDesc.getKeys()));
+
+    return OperatorFactory.getVectorOperator(
+        topNKeyOperator.getCompilationOpContext(), topNKeyDesc,
+        vContext, vectorTopNKeyDesc);
+  }
+
   private static void fillInPTFEvaluators(
       List<WindowFunctionDef> windowsFunctions,
       String[] evaluatorFunctionNames,
@@ -4591,6 +4607,13 @@ private static VectorPTFInfo createVectorPTFInfo(
diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java
--- ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java
+++ ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java
@@ ... @@ protected void optimizeOperatorPlan(ParseContext pCtx, Set<ReadEntity> inputs,
     // Create the context for the walker
     OptimizeTezProcContext procCtx = new OptimizeTezProcContext(conf, pCtx, inputs, outputs);
 
+    if (procCtx.conf.getBoolVar(ConfVars.HIVE_OPTIMIZE_TOPNKEY)) {
+      perfLogger.PerfLogBegin(this.getClass().getName(), PerfLogger.TEZ_COMPILER);
+      runTopNKeyOptimization(procCtx);
+      perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.TEZ_COMPILER, "Run top n key optimization");
+
+      perfLogger.PerfLogBegin(this.getClass().getName(), PerfLogger.TEZ_COMPILER);
+      runTopNKeyPushdown(procCtx);
+      perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.TEZ_COMPILER, "Run top n key pushdown");
+    }
+
     perfLogger.PerfLogBegin(this.getClass().getName(), PerfLogger.TEZ_COMPILER);
     // setup dynamic partition pruning where possible
     runDynamicPartitionPruning(procCtx, inputs, outputs);
@@ -996,6 +1010,37 @@ private void removeSemiJoinIfNoStats(OptimizeTezProcContext procCtx)
     ogw.startWalking(topNodes, null);
   }
 
+  private static void runTopNKeyOptimization(OptimizeTezProcContext procCtx)
+      throws SemanticException {
+    Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
+    opRules.put(
+        new RuleRegExp("Top n key optimization", ReduceSinkOperator.getOperatorName() + "%"),
+        new TopNKeyProcessor());
+
+    // The dispatcher fires the processor corresponding to the closest matching
+    // rule and passes the context along
+    Dispatcher disp = new DefaultRuleDispatcher(null, opRules, procCtx);
+    List<Node> topNodes = new ArrayList<Node>();
+    topNodes.addAll(procCtx.parseContext.getTopOps().values());
+    GraphWalker ogw = new DefaultGraphWalker(disp);
+    ogw.startWalking(topNodes, null);
+  }
+
+  private static void runTopNKeyPushdown(OptimizeTezProcContext procCtx) throws SemanticException {
+    Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
+    opRules.put(
+        new RuleRegExp("Top n key pushdown", TopNKeyOperator.getOperatorName() + "%"),
+        new TopNKeyPushdown());
+
+    // The dispatcher fires the processor corresponding to the closest matching
+    // rule and passes the context along
+    Dispatcher disp = new DefaultRuleDispatcher(null, opRules, procCtx);
+    List<Node> topNodes = new ArrayList<Node>();
+    topNodes.addAll(procCtx.parseContext.getTopOps().values());
+    GraphWalker ogw = new PreOrderOnceWalker(disp);
+    ogw.startWalking(topNodes, null);
+  }
+
   private boolean findParallelSemiJoinBranch(Operator<?> mapjoin, TableScanOperator bigTableTS,
       ParseContext parseContext, Map<ReduceSinkOperator, SemiJoinBranchInfo> semijoins) {
diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/TopNKeyDesc.java ql/src/java/org/apache/hadoop/hive/ql/plan/TopNKeyDesc.java
new file mode 100644
index 0000000000..0d83a79400
--- /dev/null
+++ ql/src/java/org/apache/hadoop/hive/ql/plan/TopNKeyDesc.java
@@ -0,0 +1,106 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.plan;
+
+import org.apache.hadoop.hive.ql.plan.Explain.Level;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * TopNKeyDesc.
+ */
+@Explain(displayName = "Top N Key Operator",
+    explainLevels = { Level.USER, Level.DEFAULT, Level.EXTENDED })
+public class TopNKeyDesc extends AbstractOperatorDesc {
+  private static final long serialVersionUID = 1L;
+
+  private int topN;
+  private List<ExprNodeDesc> keyColumns;
+  private String columnSortOrder;
+
+  public TopNKeyDesc() {
+  }
+
+  public TopNKeyDesc(
+      final int topN,
+      final List<ExprNodeDesc> keyColumns,
+      final String columnSortOrder) {
+    this.topN = topN;
+    this.keyColumns = keyColumns;
+    this.columnSortOrder = columnSortOrder;
+  }
+
+  @Explain(displayName = "keys")
+  public String getKeyString() {
+    return PlanUtils.getExprListString(keyColumns);
+  }
+
+  @Explain(displayName = "keys", explainLevels = { Level.USER })
+  public String getUserLevelExplainKeyString() {
+    return PlanUtils.getExprListString(keyColumns, true);
+  }
+
+  public List<ExprNodeDesc> getKeys() {
+    return keyColumns;
+  }
+
+  public void setKeyColumns(final List<ExprNodeDesc> keyColumns) {
+    this.keyColumns = keyColumns;
+  }
+
+  @Explain(displayName = "top n", explainLevels = { Level.DEFAULT, Level.EXTENDED, Level.USER })
+  public int getTopN() {
+    return topN;
+  }
+
+  public void setTopN(int topN) {
+    this.topN = topN;
+  }
+
+  @Explain(displayName = "order", explainLevels = { Level.DEFAULT, Level.EXTENDED, Level.USER })
+  public String getColumnSortOrder() {
+    return columnSortOrder;
+  }
+
+  public void setColumnSortOrder(String columnSortOrder) {
+    this.columnSortOrder = columnSortOrder;
+  }
+
+  @Override
+  public boolean isSame(OperatorDesc other) {
+    if (getClass().getName().equals(other.getClass().getName())) {
+      TopNKeyDesc otherDesc = (TopNKeyDesc) other;
+      return getTopN() == otherDesc.getTopN() &&
+          columnSortOrder.equals(otherDesc.columnSortOrder) &&
+          keyColumns.equals(otherDesc.keyColumns);
+    }
+    return false;
+  }
+
+  @Explain(displayName = "key column names", explainLevels = { Level.DEFAULT, Level.EXTENDED, Level.USER })
+  public List<String> getKeyColumnNames() {
+    List<String> names = new ArrayList<>();
+    for (ExprNodeDesc col : keyColumns) {
+      names.add(col.getExprString());
+    }
+    return names;
+  }
+}
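Aside (not part of the patch): TopNKeyDesc.isSame compares topN, sort order and key columns rather than object identity; TopNKeyPushdown relies on this for its "Avoid duplicates" guard. A tiny hedged demonstration (hypothetical class name; key construction only illustrative):

    import java.util.Arrays;
    import java.util.List;

    import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
    import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
    import org.apache.hadoop.hive.ql.plan.TopNKeyDesc;
    import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;

    public class TopNKeyDescIsSameDemo {
      public static void main(String[] args) {
        List<ExprNodeDesc> keys = Arrays.<ExprNodeDesc>asList(
            new ExprNodeColumnDesc(TypeInfoFactory.stringTypeInfo, "key", "src", false));
        TopNKeyDesc a = new TopNKeyDesc(5, keys, "+");
        TopNKeyDesc b = new TopNKeyDesc(5, keys, "+");
        System.out.println(a.isSame(b)); // true: same topN, order and key columns
      }
    }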
diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/VectorTopNKeyDesc.java ql/src/java/org/apache/hadoop/hive/ql/plan/VectorTopNKeyDesc.java
new file mode 100644
index 0000000000..23cb1cc6ad
--- /dev/null
+++ ql/src/java/org/apache/hadoop/hive/ql/plan/VectorTopNKeyDesc.java
@@ -0,0 +1,38 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.plan;
+
+import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression;
+
+public class VectorTopNKeyDesc extends AbstractVectorDesc {
+  private static final long serialVersionUID = 1L;
+
+  private VectorExpression[] keyExpressions;
+
+  public VectorTopNKeyDesc() {
+  }
+
+  public void setKeyExpressions(VectorExpression[] keyExpressions) {
+    this.keyExpressions = keyExpressions;
+  }
+
+  public VectorExpression[] getKeyExpressions() {
+    return keyExpressions;
+  }
+}
diff --git ql/src/test/queries/clientpositive/topn_key.q ql/src/test/queries/clientpositive/topn_key.q
new file mode 100644
index 0000000000..28a755a538
--- /dev/null
+++ ql/src/test/queries/clientpositive/topn_key.q
@@ -0,0 +1,21 @@
+set hive.mapred.mode=nonstrict;
+set hive.optimize.topnkey=true;
+
+EXPLAIN
+SELECT key, sum(substr(value,5)) FROM src GROUP BY key ORDER BY key LIMIT 5;
+
+SELECT key, sum(substr(value,5)) FROM src GROUP BY key ORDER BY key LIMIT 5;
+
+EXPLAIN
+SELECT src.key FROM src GROUP BY src.key LIMIT 5;
+
+SELECT src.key FROM src GROUP BY src.key LIMIT 5;
+
+EXPLAIN
+SELECT src1.key AS k, SUM(SUBSTR(src2.value,5)) AS v FROM
+  src AS src1 JOIN src AS src2 ON (src1.key = src2.key)
+  GROUP BY src1.key ORDER BY k LIMIT 5;
+
+SELECT src1.key AS k, sum(substr(src2.value,5)) AS v FROM
+  src AS src1 JOIN src AS src2 ON (src1.key = src2.key)
+  GROUP BY src1.key ORDER BY k LIMIT 5;
diff --git ql/src/test/queries/clientpositive/vector_topn_key.q ql/src/test/queries/clientpositive/vector_topn_key.q
new file mode 100644
index 0000000000..617a99bde4
--- /dev/null
+++ ql/src/test/queries/clientpositive/vector_topn_key.q
@@ -0,0 +1,28 @@
+set hive.mapred.mode=nonstrict;
+set hive.vectorized.execution.enabled=true;
+set hive.optimize.topnkey=true;
+
+explain vectorization
+SELECT key, sum(substr(value,5)) FROM src GROUP BY key ORDER BY key LIMIT 5;
+
+SELECT key, sum(substr(value,5)) FROM src GROUP BY key ORDER BY key LIMIT 5;
+
+explain vectorization
+SELECT key FROM src GROUP BY key LIMIT 5;
+
+SELECT key FROM src GROUP BY key LIMIT 5;
+
+EXPLAIN VECTORIZATION
+SELECT src1.key as k1, src1.value as v1,
+  src2.key as k2, src2.value as v2 FROM
+  (SELECT * FROM src WHERE src.key < 10) src1
+  JOIN
+  (SELECT * FROM src WHERE src.key < 10) src2
+  SORT BY k1, v1, k2, v2;
+
+SELECT src1.key as k1, src1.value as v1,
+  src2.key as k2, src2.value as v2 FROM
+  (SELECT * FROM src WHERE src.key < 10) src1
+  JOIN
+  (SELECT * FROM src WHERE src.key < 10) src2
+  SORT BY k1, v1, k2, v2;
diff --git ql/src/test/results/clientpositive/llap/topn_key.q.out ql/src/test/results/clientpositive/llap/topn_key.q.out
new file mode 100644
index 0000000000..8f7f984ccc
--- /dev/null
+++ ql/src/test/results/clientpositive/llap/topn_key.q.out
@@ -0,0 +1,362 @@
+PREHOOK: query: EXPLAIN
+SELECT key, sum(substr(value,5)) FROM src GROUP BY key ORDER BY key LIMIT 5
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN
+SELECT key, sum(substr(value,5)) FROM src GROUP BY key ORDER BY key LIMIT 5
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Edges:
+        Reducer 2 <- Map 1 (SIMPLE_EDGE)
+        Reducer 3 <- Reducer 2 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: src
+                  Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
+                  Top N Key Operator
+                    order: +
+                    key column names: key
+                    keys: key (type: string)
+                    Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
+                    top n: 5
+                    Select Operator
+                      expressions: key (type: string), substr(value, 5) (type: string)
+                      outputColumnNames: _col0, _col1
+                      Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
+                      Group By Operator
+                        aggregations: sum(_col1)
+                        keys: _col0 (type: string)
+                        mode: hash
+                        outputColumnNames: _col0, _col1
+                        Statistics: Num rows: 250 Data size: 23750 Basic stats: COMPLETE Column stats: COMPLETE
+                        Reduce Output Operator
+                          key expressions: _col0 (type: string)
+                          sort order: +
+                          Map-reduce partition columns: _col0 (type: string)
+                          Statistics: Num rows: 250 Data size: 23750 Basic stats: COMPLETE Column stats: COMPLETE
+                          TopN Hash Memory Usage: 0.1
+                          value expressions: _col1 (type: double)
+            Execution mode: llap
+            LLAP IO: no inputs
+        Reducer 2 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Top N Key Operator
+                order: +
+                key column names: KEY._col0
+                keys: KEY._col0 (type: string)
+                Statistics: Num rows: 250 Data size: 23750 Basic stats: COMPLETE Column stats: COMPLETE
+                top n: 5
+                Group By Operator
+                  aggregations: sum(VALUE._col0)
+                  keys: KEY._col0 (type: string)
+                  mode: mergepartial
+                  outputColumnNames: _col0, _col1
+                  Statistics: Num rows: 250 Data size: 23750 Basic stats: COMPLETE Column stats: COMPLETE
+                  Reduce Output Operator
+                    key expressions: _col0 (type: string)
+                    sort order: +
+                    Statistics: Num rows: 250 Data size: 23750 Basic stats: COMPLETE Column stats: COMPLETE
+                    TopN Hash Memory Usage: 0.1
+                    value expressions: _col1 (type: double)
+        Reducer 3 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Select Operator
+                expressions: KEY.reducesinkkey0 (type: string), VALUE._col0 (type: double)
+                outputColumnNames: _col0, _col1
+                Statistics: Num rows: 250 Data size: 23750 Basic stats: COMPLETE Column stats: COMPLETE
+                Limit
+                  Number of rows: 5
+                  Statistics: Num rows: 5 Data size: 475 Basic stats: COMPLETE Column stats: COMPLETE
+                  File Output Operator
+                    compressed: false
+                    Statistics: Num rows: 5 Data size: 475 Basic stats: COMPLETE Column stats: COMPLETE
+                    table:
+                        input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                        output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                        serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: 5
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: SELECT key, sum(substr(value,5)) FROM src GROUP BY key ORDER BY key LIMIT 5
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT key, sum(substr(value,5)) FROM src GROUP BY key ORDER BY key LIMIT 5
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+#### A masked pattern was here ####
+0	0.0
+10	10.0
+100	200.0
+103	206.0
+104	208.0
+PREHOOK: query: EXPLAIN
+SELECT src.key FROM src GROUP BY src.key LIMIT 5
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN
+SELECT src.key FROM src GROUP BY src.key LIMIT 5
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Edges:
+        Reducer 2 <- Map 1 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: src
+                  Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE
+                  Top N Key Operator
+                    order: +
+                    key column names: key
+                    keys: key (type: string)
+                    Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE
+                    top n: 5
+                    Select Operator
+                      expressions: key (type: string)
+                      outputColumnNames: key
+                      Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE
+                      Group By Operator
+                        keys: key (type: string)
+                        mode: hash
+                        outputColumnNames: _col0
+                        Statistics: Num rows: 250 Data size: 21750 Basic stats: COMPLETE Column stats: COMPLETE
+                        Reduce Output Operator
+                          key expressions: _col0 (type: string)
+                          sort order: +
+                          Map-reduce partition columns: _col0 (type: string)
+                          Statistics: Num rows: 250 Data size: 21750 Basic stats: COMPLETE Column stats: COMPLETE
+                          TopN Hash Memory Usage: 0.1
+            Execution mode: llap
+            LLAP IO: no inputs
+        Reducer 2 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Group By Operator
+                keys: KEY._col0 (type: string)
+                mode: mergepartial
+                outputColumnNames: _col0
+                Statistics: Num rows: 250 Data size: 21750 Basic stats: COMPLETE Column stats: COMPLETE
+                Limit
+                  Number of rows: 5
+                  Statistics: Num rows: 5 Data size: 435 Basic stats: COMPLETE Column stats: COMPLETE
+                  File Output Operator
+                    compressed: false
+                    Statistics: Num rows: 5 Data size: 435 Basic stats: COMPLETE Column stats: COMPLETE
+                    table:
+                        input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                        output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                        serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: 5
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: SELECT src.key FROM src GROUP BY src.key LIMIT 5
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT src.key FROM src GROUP BY src.key LIMIT 5
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+#### A masked pattern was here ####
+10
+100
+104
+103
+0
+PREHOOK: query: EXPLAIN
+SELECT src1.key AS k, SUM(SUBSTR(src2.value,5)) AS v FROM
+  src AS src1 JOIN src AS src2 ON (src1.key = src2.key)
+  GROUP BY src1.key ORDER BY k LIMIT 5
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN
+SELECT src1.key AS k, SUM(SUBSTR(src2.value,5)) AS v FROM
+  src AS src1 JOIN src AS src2 ON (src1.key = src2.key)
+  GROUP BY src1.key ORDER BY k LIMIT 5
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Edges:
+        Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 5 (SIMPLE_EDGE)
+        Reducer 3 <- Reducer 2 (SIMPLE_EDGE)
+        Reducer 4 <- Reducer 3 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: src1
+                  Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE
+                  Top N Key Operator
+                    order: +
+                    key column names: key
+                    keys: key (type: string)
+                    Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE
+                    top n: 5
+                    Filter Operator
+                      predicate: key is not null (type: boolean)
+                      Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE
+                      Select Operator
+                        expressions: key (type: string)
+                        outputColumnNames: _col0
+                        Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE
+                        Reduce Output Operator
+                          key expressions: _col0 (type: string)
+                          sort order: +
+                          Map-reduce partition columns: _col0 (type: string)
+                          Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE
+            Execution mode: llap
+            LLAP IO: no inputs
+        Map 5 
+            Map Operator Tree:
+                TableScan
+                  alias: src2
+                  Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
+                  Top N Key Operator
+                    order: +
+                    key column names: key
+                    keys: key (type: string)
+                    Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
+                    top n: 5
+                    Filter Operator
+                      predicate: key is not null (type: boolean)
+                      Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
+                      Select Operator
+                        expressions: key (type: string), value (type: string)
+                        outputColumnNames: _col0, _col1
+                        Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
+                        Reduce Output Operator
+                          key expressions: _col0 (type: string)
+                          sort order: +
+                          Map-reduce partition columns: _col0 (type: string)
+                          Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
+                          value expressions: _col1 (type: string)
+            Execution mode: llap
+            LLAP IO: no inputs
+        Reducer 2 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Merge Join Operator
+                condition map:
+                     Inner Join 0 to 1
+                keys:
+                  0 _col0 (type: string)
+                  1 _col0 (type: string)
+                outputColumnNames: _col0, _col2
+                Statistics: Num rows: 809 Data size: 144002 Basic stats: COMPLETE Column stats: COMPLETE
+                Top N Key Operator
+                  order: +
+                  key column names: _col0
+                  keys: _col0 (type: string)
+                  Statistics: Num rows: 809 Data size: 144002 Basic stats: COMPLETE Column stats: COMPLETE
+                  top n: 5
+                  Select Operator
+                    expressions: _col0 (type: string), substr(_col2, 5) (type: string)
+                    outputColumnNames: _col0, _col1
+                    Statistics: Num rows: 809 Data size: 144002 Basic stats: COMPLETE Column stats: COMPLETE
+                    Group By Operator
+                      aggregations: sum(_col1)
+                      keys: _col0 (type: string)
+                      mode: hash
+                      outputColumnNames: _col0, _col1
+                      Statistics: Num rows: 309 Data size: 29355 Basic stats: COMPLETE Column stats: COMPLETE
+                      Reduce Output Operator
+                        key expressions: _col0 (type: string)
+                        sort order: +
+                        Map-reduce partition columns: _col0 (type: string)
+                        Statistics: Num rows: 309 Data size: 29355 Basic stats: COMPLETE Column stats: COMPLETE
+                        TopN Hash Memory Usage: 0.1
+                        value expressions: _col1 (type: double)
+        Reducer 3 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Top N Key Operator
+                order: +
+                key column names: KEY._col0
+                keys: KEY._col0 (type: string)
+                Statistics: Num rows: 309 Data size: 29355 Basic stats: COMPLETE Column stats: COMPLETE
+                top n: 5
+                Group By Operator
+                  aggregations: sum(VALUE._col0)
+                  keys: KEY._col0 (type: string)
+                  mode: mergepartial
+                  outputColumnNames: _col0, _col1
+                  Statistics: Num rows: 309 Data size: 29355 Basic stats: COMPLETE Column stats: COMPLETE
+                  Reduce Output Operator
+                    key expressions: _col0 (type: string)
+                    sort order: +
+                    Statistics: Num rows: 309 Data size: 29355 Basic stats: COMPLETE Column stats: COMPLETE
+                    TopN Hash Memory Usage: 0.1
+                    value expressions: _col1 (type: double)
+        Reducer 4 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Select Operator
+                expressions: KEY.reducesinkkey0 (type: string), VALUE._col0 (type: double)
+                outputColumnNames: _col0, _col1
+                Statistics: Num rows: 309 Data size: 29355 Basic stats: COMPLETE Column stats: COMPLETE
+                Limit
+                  Number of rows: 5
+                  Statistics: Num rows: 5 Data size: 475 Basic stats: COMPLETE Column stats: COMPLETE
+                  File Output Operator
+                    compressed: false
+                    Statistics: Num rows: 5 Data size: 475 Basic stats: COMPLETE Column stats: COMPLETE
+                    table:
+                        input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                        output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                        serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: 5
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: SELECT src1.key AS k, sum(substr(src2.value,5)) AS v FROM
+  src AS src1 JOIN src AS src2 ON (src1.key = src2.key)
+  GROUP BY src1.key ORDER BY k LIMIT 5
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT src1.key AS k, sum(substr(src2.value,5)) AS v FROM
+  src AS src1 JOIN src AS src2 ON (src1.key = src2.key)
+  GROUP BY src1.key ORDER BY k LIMIT 5
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+#### A masked pattern was here ####
+0	0.0
+10	10.0
+100	400.0
+103	412.0
+104	416.0
diff --git ql/src/test/results/clientpositive/llap/vector_topn_key.q.out ql/src/test/results/clientpositive/llap/vector_topn_key.q.out
new file mode 100644
index 0000000000..93ebedd200
--- /dev/null
+++ ql/src/test/results/clientpositive/llap/vector_topn_key.q.out
@@ -0,0 +1,483 @@
+PREHOOK: query: explain vectorization
+SELECT key, sum(substr(value,5)) FROM src GROUP BY key ORDER BY key LIMIT 5
+PREHOOK: type: QUERY
+POSTHOOK: query: explain vectorization
+SELECT key, sum(substr(value,5)) FROM src GROUP BY key ORDER BY key LIMIT 5
+POSTHOOK: type: QUERY
+PLAN VECTORIZATION:
+  enabled: true
+  enabledConditionsMet: [hive.vectorized.execution.enabled IS true]
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Edges:
+        Reducer 2 <- Map 1 (SIMPLE_EDGE)
+        Reducer 3 <- Reducer 2 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: src
+                  Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
+                  Top N Key Operator
+                    order: +
+                    key column names: key
+                    keys: key (type: string)
+                    Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
+                    top n: 5
+                    Select Operator
+                      expressions: key (type: string), substr(value, 5) (type: string)
+                      outputColumnNames: _col0, _col1
+                      Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
+                      Group By Operator
+                        aggregations: sum(_col1)
+                        keys: _col0 (type: string)
+                        mode: hash
+                        outputColumnNames: _col0, _col1
+                        Statistics: Num rows: 250 Data size: 23750 Basic stats: COMPLETE Column stats: COMPLETE
+                        Reduce Output Operator
+                          key expressions: _col0 (type: string)
+                          sort order: +
+                          Map-reduce partition columns: _col0 (type: string)
+                          Statistics: Num rows: 250 Data size: 23750 Basic stats: COMPLETE Column stats: COMPLETE
+                          TopN Hash Memory Usage: 0.1
+                          value expressions: _col1 (type: double)
+            Execution mode: llap
+            LLAP IO: no inputs
+            Map Vectorization:
+                enabled: true
+                enabledConditionsMet: hive.vectorized.use.vector.serde.deserialize IS true
+                inputFileFormats: org.apache.hadoop.mapred.TextInputFormat
+                notVectorizedReason: SELECT operator: The column value is not in the vectorization context column map {key=0}.
+                vectorized: false
+        Reducer 2 
+            Execution mode: llap
+            Reduce Vectorization:
+                enabled: true
+                enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true, hive.execution.engine tez IN [tez, spark] IS true
+                notVectorizedReason: GROUPBY operator: The column VALUE._col0 is not in the vectorization context column map {KEY._col0=0}.
+                vectorized: false
+            Reduce Operator Tree:
+              Top N Key Operator
+                order: +
+                key column names: KEY._col0
+                keys: KEY._col0 (type: string)
+                Statistics: Num rows: 250 Data size: 23750 Basic stats: COMPLETE Column stats: COMPLETE
+                top n: 5
+                Group By Operator
+                  aggregations: sum(VALUE._col0)
+                  keys: KEY._col0 (type: string)
+                  mode: mergepartial
+                  outputColumnNames: _col0, _col1
+                  Statistics: Num rows: 250 Data size: 23750 Basic stats: COMPLETE Column stats: COMPLETE
+                  Reduce Output Operator
+                    key expressions: _col0 (type: string)
+                    sort order: +
+                    Statistics: Num rows: 250 Data size: 23750 Basic stats: COMPLETE Column stats: COMPLETE
+                    TopN Hash Memory Usage: 0.1
+                    value expressions: _col1 (type: double)
+        Reducer 3 
+            Execution mode: vectorized, llap
+            Reduce Vectorization:
+                enabled: true
+                enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true, hive.execution.engine tez IN [tez, spark] IS true
+                allNative: false
+                usesVectorUDFAdaptor: false
+                vectorized: true
+            Reduce Operator Tree:
+              Select Operator
+                expressions: KEY.reducesinkkey0 (type: string), VALUE._col0 (type: double)
+                outputColumnNames: _col0, _col1
+                Statistics: Num rows: 250 Data size: 23750 Basic stats: COMPLETE Column stats: COMPLETE
+                Limit
+                  Number of rows: 5
+                  Statistics: Num rows: 5 Data size: 475 Basic stats: COMPLETE Column stats: COMPLETE
+                  File Output Operator
+                    compressed: false
+                    Statistics: Num rows: 5 Data size: 475 Basic stats: COMPLETE Column stats: COMPLETE
+                    table:
+                        input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                        output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                        serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: 5
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: SELECT key, sum(substr(value,5)) FROM src GROUP BY key ORDER BY key LIMIT 5
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT key, sum(substr(value,5)) FROM src GROUP BY key ORDER BY key LIMIT 5
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+#### A masked pattern was here ####
+0	0.0
+10	10.0
+100	200.0
+103	206.0
+104	208.0
+PREHOOK: query: explain vectorization
+SELECT key FROM src GROUP BY key LIMIT 5
+PREHOOK: type: QUERY
+POSTHOOK: query: explain vectorization
+SELECT key FROM src GROUP BY key LIMIT 5
+POSTHOOK: type: QUERY
+PLAN VECTORIZATION:
+  enabled: true
+  enabledConditionsMet: [hive.vectorized.execution.enabled IS true]
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Edges:
+        Reducer 2 <- Map 1 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: src
+                  Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE
+                  Top N Key Operator
+                    order: +
+                    key column names: key
+                    keys: key (type: string)
+                    Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE
+                    top n: 5
+                    Select Operator
+                      expressions: key (type: string)
+                      outputColumnNames: key
+                      Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE
+                      Group By Operator
+                        keys: key (type: string)
+                        mode: hash
+                        outputColumnNames: _col0
+                        Statistics: Num rows: 250 Data size: 21750 Basic stats: COMPLETE Column stats: COMPLETE
+                        Reduce Output Operator
+                          key expressions: _col0 (type: string)
+                          sort order: +
+                          Map-reduce partition columns: _col0 (type: string)
Statistics: Num rows: 250 Data size: 21750 Basic stats: COMPLETE Column stats: COMPLETE + TopN Hash Memory Usage: 0.1 + Execution mode: vectorized, llap + LLAP IO: no inputs + Map Vectorization: + enabled: true + enabledConditionsMet: hive.vectorized.use.vector.serde.deserialize IS true + inputFormatFeatureSupport: [DECIMAL_64] + vectorizationSupportRemovedReasons: [DECIMAL_64 disabled because LLAP is enabled] + featureSupportInUse: [] + inputFileFormats: org.apache.hadoop.mapred.TextInputFormat + allNative: false + usesVectorUDFAdaptor: false + vectorized: true + Reducer 2 + Execution mode: vectorized, llap + Reduce Vectorization: + enabled: true + enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true, hive.execution.engine tez IN [tez, spark] IS true + allNative: false + usesVectorUDFAdaptor: false + vectorized: true + Reduce Operator Tree: + Group By Operator + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 250 Data size: 21750 Basic stats: COMPLETE Column stats: COMPLETE + Limit + Number of rows: 5 + Statistics: Num rows: 5 Data size: 435 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 5 Data size: 435 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: 5 + Processor Tree: + ListSink + +PREHOOK: query: SELECT key FROM src GROUP BY key LIMIT 5 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: SELECT key FROM src GROUP BY key LIMIT 5 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +92 +97 +95 +96 +98 +Warning: Shuffle Join MERGEJOIN[15][tables = [src1, src2]] in Stage 'Reducer 2' is a cross product +PREHOOK: query: EXPLAIN VECTORIZATION +SELECT src1.key as k1, src1.value as v1, + src2.key as k2, src2.value as v2 FROM + (SELECT * FROM src WHERE src.key < 10) src1 + JOIN + (SELECT * FROM src WHERE src.key < 10) src2 + SORT BY k1, v1, k2, v2 +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN VECTORIZATION +SELECT src1.key as k1, src1.value as v1, + src2.key as k2, src2.value as v2 FROM + (SELECT * FROM src WHERE src.key < 10) src1 + JOIN + (SELECT * FROM src WHERE src.key < 10) src2 + SORT BY k1, v1, k2, v2 +POSTHOOK: type: QUERY +PLAN VECTORIZATION: + enabled: true + enabledConditionsMet: [hive.vectorized.execution.enabled IS true] + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (XPROD_EDGE), Map 4 (XPROD_EDGE) + Reducer 3 <- Reducer 2 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: (key < 10) (type: boolean) + Statistics: Num rows: 166 Data size: 29548 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 166 Data size: 29548 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 166 Data size: 29548 Basic stats: 
COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: string), _col1 (type: string) + Execution mode: vectorized, llap + LLAP IO: no inputs + Map Vectorization: + enabled: true + enabledConditionsMet: hive.vectorized.use.vector.serde.deserialize IS true + inputFormatFeatureSupport: [DECIMAL_64] + vectorizationSupportRemovedReasons: [DECIMAL_64 disabled because LLAP is enabled] + featureSupportInUse: [] + inputFileFormats: org.apache.hadoop.mapred.TextInputFormat + allNative: true + usesVectorUDFAdaptor: true + vectorized: true + Map 4 + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: (key < 10) (type: boolean) + Statistics: Num rows: 166 Data size: 29548 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 166 Data size: 29548 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 166 Data size: 29548 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: string), _col1 (type: string) + Execution mode: vectorized, llap + LLAP IO: no inputs + Map Vectorization: + enabled: true + enabledConditionsMet: hive.vectorized.use.vector.serde.deserialize IS true + inputFormatFeatureSupport: [DECIMAL_64] + vectorizationSupportRemovedReasons: [DECIMAL_64 disabled because LLAP is enabled] + featureSupportInUse: [] + inputFileFormats: org.apache.hadoop.mapred.TextInputFormat + allNative: true + usesVectorUDFAdaptor: true + vectorized: true + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 + 1 + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 27556 Data size: 9809936 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string), _col1 (type: string), _col2 (type: string), _col3 (type: string) + sort order: ++++ + Statistics: Num rows: 27556 Data size: 9809936 Basic stats: COMPLETE Column stats: COMPLETE + Reducer 3 + Execution mode: vectorized, llap + Reduce Vectorization: + enabled: true + enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true, hive.execution.engine tez IN [tez, spark] IS true + allNative: false + usesVectorUDFAdaptor: false + vectorized: true + Reduce Operator Tree: + Select Operator + expressions: KEY.reducesinkkey0 (type: string), KEY.reducesinkkey1 (type: string), KEY.reducesinkkey2 (type: string), KEY.reducesinkkey3 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 27556 Data size: 9809936 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 27556 Data size: 9809936 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +Warning: Shuffle Join MERGEJOIN[15][tables = [src1, src2]] in Stage 'Reducer 2' is a cross product +PREHOOK: query: SELECT src1.key as k1, src1.value as v1, + src2.key as k2, src2.value as v2 FROM + (SELECT * FROM src WHERE src.key < 10) src1 + JOIN + (SELECT * FROM src WHERE src.key < 10) src2 + SORT BY k1, v1, 
k2, v2 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: SELECT src1.key as k1, src1.value as v1, + src2.key as k2, src2.value as v2 FROM + (SELECT * FROM src WHERE src.key < 10) src1 + JOIN + (SELECT * FROM src WHERE src.key < 10) src2 + SORT BY k1, v1, k2, v2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +0 val_0 0 val_0 +0 val_0 0 val_0 +0 val_0 0 val_0 +0 val_0 0 val_0 +0 val_0 0 val_0 +0 val_0 0 val_0 +0 val_0 0 val_0 +0 val_0 0 val_0 +0 val_0 0 val_0 +0 val_0 2 val_2 +0 val_0 2 val_2 +0 val_0 2 val_2 +0 val_0 4 val_4 +0 val_0 4 val_4 +0 val_0 4 val_4 +0 val_0 5 val_5 +0 val_0 5 val_5 +0 val_0 5 val_5 +0 val_0 5 val_5 +0 val_0 5 val_5 +0 val_0 5 val_5 +0 val_0 5 val_5 +0 val_0 5 val_5 +0 val_0 5 val_5 +0 val_0 8 val_8 +0 val_0 8 val_8 +0 val_0 8 val_8 +0 val_0 9 val_9 +0 val_0 9 val_9 +0 val_0 9 val_9 +2 val_2 0 val_0 +2 val_2 0 val_0 +2 val_2 0 val_0 +2 val_2 2 val_2 +2 val_2 4 val_4 +2 val_2 5 val_5 +2 val_2 5 val_5 +2 val_2 5 val_5 +2 val_2 8 val_8 +2 val_2 9 val_9 +4 val_4 0 val_0 +4 val_4 0 val_0 +4 val_4 0 val_0 +4 val_4 2 val_2 +4 val_4 4 val_4 +4 val_4 5 val_5 +4 val_4 5 val_5 +4 val_4 5 val_5 +4 val_4 8 val_8 +4 val_4 9 val_9 +5 val_5 0 val_0 +5 val_5 0 val_0 +5 val_5 0 val_0 +5 val_5 0 val_0 +5 val_5 0 val_0 +5 val_5 0 val_0 +5 val_5 0 val_0 +5 val_5 0 val_0 +5 val_5 0 val_0 +5 val_5 2 val_2 +5 val_5 2 val_2 +5 val_5 2 val_2 +5 val_5 4 val_4 +5 val_5 4 val_4 +5 val_5 4 val_4 +5 val_5 5 val_5 +5 val_5 5 val_5 +5 val_5 5 val_5 +5 val_5 5 val_5 +5 val_5 5 val_5 +5 val_5 5 val_5 +5 val_5 5 val_5 +5 val_5 5 val_5 +5 val_5 5 val_5 +5 val_5 8 val_8 +5 val_5 8 val_8 +5 val_5 8 val_8 +5 val_5 9 val_9 +5 val_5 9 val_9 +5 val_5 9 val_9 +8 val_8 0 val_0 +8 val_8 0 val_0 +8 val_8 0 val_0 +8 val_8 2 val_2 +8 val_8 4 val_4 +8 val_8 5 val_5 +8 val_8 5 val_5 +8 val_8 5 val_5 +8 val_8 8 val_8 +8 val_8 9 val_9 +9 val_9 0 val_0 +9 val_9 0 val_0 +9 val_9 0 val_0 +9 val_9 2 val_2 +9 val_9 4 val_4 +9 val_9 5 val_5 +9 val_9 5 val_5 +9 val_9 5 val_5 +9 val_9 8 val_8 +9 val_9 9 val_9 diff --git ql/src/test/results/clientpositive/perf/tez/query27.q.out ql/src/test/results/clientpositive/perf/tez/query27.q.out index c6a190553e..109405a345 100644 --- ql/src/test/results/clientpositive/perf/tez/query27.q.out +++ ql/src/test/results/clientpositive/perf/tez/query27.q.out @@ -68,73 +68,83 @@ Stage-0 Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6"] Group By Operator [GBY_31] (rows=1264972921 width=88) Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6"],aggregations:["avg(VALUE._col0)","avg(VALUE._col1)","avg(VALUE._col2)","avg(VALUE._col3)"],keys:KEY._col0, KEY._col1, KEY._col2 - <-Reducer 5 [SIMPLE_EDGE] - SHUFFLE [RS_30] - PartitionCols:_col0, _col1, _col2 - Group By Operator [GBY_29] (rows=2529945843 width=88) - Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6"],aggregations:["avg(_col2)","avg(_col3)","avg(_col4)","avg(_col5)"],keys:_col0, _col1, 0 - Select Operator [SEL_27] (rows=843315281 width=88) - Output:["_col0","_col1","_col2","_col3","_col4","_col5"] - Merge Join Operator [MERGEJOIN_59] (rows=843315281 width=88) - Conds:RS_24._col1=RS_25._col0(Inner),Output:["_col4","_col5","_col6","_col7","_col15","_col17"] - <-Map 11 [SIMPLE_EDGE] - SHUFFLE [RS_25] - PartitionCols:_col0 - Select Operator [SEL_14] (rows=462000 width=1436) - Output:["_col0","_col1"] - Filter Operator [FIL_55] (rows=462000 width=1436) - predicate:i_item_sk is not null - TableScan [TS_12] 
(rows=462000 width=1436) - default@item,item,Tbl:COMPLETE,Col:NONE,Output:["i_item_sk","i_item_id"] - <-Reducer 4 [SIMPLE_EDGE] - SHUFFLE [RS_24] - PartitionCols:_col1 - Merge Join Operator [MERGEJOIN_58] (rows=766650239 width=88) - Conds:RS_21._col3=RS_22._col0(Inner),Output:["_col1","_col4","_col5","_col6","_col7","_col15"] - <-Map 10 [SIMPLE_EDGE] - SHUFFLE [RS_22] + Top N Key Operator [TNK_56] (rows=2529945843 width=88) + key column names:["KEY._col0","KEY._col1"],keys:KEY._col0, KEY._col1,order:++,top n:100 + <-Reducer 5 [SIMPLE_EDGE] + SHUFFLE [RS_30] + PartitionCols:_col0, _col1, _col2 + Group By Operator [GBY_29] (rows=2529945843 width=88) + Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6"],aggregations:["avg(_col2)","avg(_col3)","avg(_col4)","avg(_col5)"],keys:_col0, _col1, 0 + Select Operator [SEL_27] (rows=843315281 width=88) + Output:["_col0","_col1","_col2","_col3","_col4","_col5"] + Top N Key Operator [TNK_57] (rows=843315281 width=88) + key column names:["_col17","_col15"],keys:_col17, _col15,order:++,top n:100 + Merge Join Operator [MERGEJOIN_64] (rows=843315281 width=88) + Conds:RS_24._col1=RS_25._col0(Inner),Output:["_col4","_col5","_col6","_col7","_col15","_col17"] + <-Map 11 [SIMPLE_EDGE] + SHUFFLE [RS_25] PartitionCols:_col0 - Select Operator [SEL_11] (rows=852 width=1910) + Select Operator [SEL_14] (rows=462000 width=1436) Output:["_col0","_col1"] - Filter Operator [FIL_54] (rows=852 width=1910) - predicate:((s_state) IN ('SD', 'FL', 'MI', 'LA', 'MO', 'SC') and s_store_sk is not null) - TableScan [TS_9] (rows=1704 width=1910) - default@store,store,Tbl:COMPLETE,Col:NONE,Output:["s_store_sk","s_state"] - <-Reducer 3 [SIMPLE_EDGE] - SHUFFLE [RS_21] - PartitionCols:_col3 - Merge Join Operator [MERGEJOIN_57] (rows=696954748 width=88) - Conds:RS_18._col0=RS_19._col0(Inner),Output:["_col1","_col3","_col4","_col5","_col6","_col7"] - <-Map 9 [SIMPLE_EDGE] - SHUFFLE [RS_19] - PartitionCols:_col0 - Select Operator [SEL_8] (rows=36524 width=1119) - Output:["_col0"] - Filter Operator [FIL_53] (rows=36524 width=1119) - predicate:((d_year = 2001) and d_date_sk is not null) - TableScan [TS_6] (rows=73049 width=1119) - default@date_dim,date_dim,Tbl:COMPLETE,Col:NONE,Output:["d_date_sk","d_year"] - <-Reducer 2 [SIMPLE_EDGE] - SHUFFLE [RS_18] - PartitionCols:_col0 - Merge Join Operator [MERGEJOIN_56] (rows=633595212 width=88) - Conds:RS_15._col2=RS_16._col0(Inner),Output:["_col0","_col1","_col3","_col4","_col5","_col6","_col7"] - <-Map 1 [SIMPLE_EDGE] - SHUFFLE [RS_15] - PartitionCols:_col2 - Select Operator [SEL_2] (rows=575995635 width=88) - Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6","_col7"] - Filter Operator [FIL_51] (rows=575995635 width=88) - predicate:(ss_cdemo_sk is not null and ss_item_sk is not null and ss_sold_date_sk is not null and ss_store_sk is not null) - TableScan [TS_0] (rows=575995635 width=88) - default@store_sales,store_sales,Tbl:COMPLETE,Col:NONE,Output:["ss_sold_date_sk","ss_item_sk","ss_cdemo_sk","ss_store_sk","ss_quantity","ss_list_price","ss_sales_price","ss_coupon_amt"] - <-Map 8 [SIMPLE_EDGE] - SHUFFLE [RS_16] - PartitionCols:_col0 - Select Operator [SEL_5] (rows=232725 width=385) - Output:["_col0"] - Filter Operator [FIL_52] (rows=232725 width=385) - predicate:((cd_education_status = '2 yr Degree') and (cd_gender = 'M') and (cd_marital_status = 'U') and cd_demo_sk is not null) - TableScan [TS_3] (rows=1861800 width=385) - 
default@customer_demographics,customer_demographics,Tbl:COMPLETE,Col:NONE,Output:["cd_demo_sk","cd_gender","cd_marital_status","cd_education_status"] + Filter Operator [FIL_55] (rows=462000 width=1436) + predicate:i_item_sk is not null + Top N Key Operator [TNK_60] (rows=462000 width=1436) + key column names:["i_item_id"],keys:i_item_id,order:+,top n:100 + TableScan [TS_12] (rows=462000 width=1436) + default@item,item,Tbl:COMPLETE,Col:NONE,Output:["i_item_sk","i_item_id"] + <-Reducer 4 [SIMPLE_EDGE] + SHUFFLE [RS_24] + PartitionCols:_col1 + Top N Key Operator [TNK_58] (rows=766650239 width=88) + key column names:["_col15"],keys:_col15,order:+,top n:100 + Merge Join Operator [MERGEJOIN_63] (rows=766650239 width=88) + Conds:RS_21._col3=RS_22._col0(Inner),Output:["_col1","_col4","_col5","_col6","_col7","_col15"] + <-Map 10 [SIMPLE_EDGE] + SHUFFLE [RS_22] + PartitionCols:_col0 + Select Operator [SEL_11] (rows=852 width=1910) + Output:["_col0","_col1"] + Filter Operator [FIL_54] (rows=852 width=1910) + predicate:((s_state) IN ('SD', 'FL', 'MI', 'LA', 'MO', 'SC') and s_store_sk is not null) + Top N Key Operator [TNK_59] (rows=1704 width=1910) + key column names:["s_state"],keys:s_state,order:+,top n:100 + TableScan [TS_9] (rows=1704 width=1910) + default@store,store,Tbl:COMPLETE,Col:NONE,Output:["s_store_sk","s_state"] + <-Reducer 3 [SIMPLE_EDGE] + SHUFFLE [RS_21] + PartitionCols:_col3 + Merge Join Operator [MERGEJOIN_62] (rows=696954748 width=88) + Conds:RS_18._col0=RS_19._col0(Inner),Output:["_col1","_col3","_col4","_col5","_col6","_col7"] + <-Map 9 [SIMPLE_EDGE] + SHUFFLE [RS_19] + PartitionCols:_col0 + Select Operator [SEL_8] (rows=36524 width=1119) + Output:["_col0"] + Filter Operator [FIL_53] (rows=36524 width=1119) + predicate:((d_year = 2001) and d_date_sk is not null) + TableScan [TS_6] (rows=73049 width=1119) + default@date_dim,date_dim,Tbl:COMPLETE,Col:NONE,Output:["d_date_sk","d_year"] + <-Reducer 2 [SIMPLE_EDGE] + SHUFFLE [RS_18] + PartitionCols:_col0 + Merge Join Operator [MERGEJOIN_61] (rows=633595212 width=88) + Conds:RS_15._col2=RS_16._col0(Inner),Output:["_col0","_col1","_col3","_col4","_col5","_col6","_col7"] + <-Map 1 [SIMPLE_EDGE] + SHUFFLE [RS_15] + PartitionCols:_col2 + Select Operator [SEL_2] (rows=575995635 width=88) + Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6","_col7"] + Filter Operator [FIL_51] (rows=575995635 width=88) + predicate:(ss_cdemo_sk is not null and ss_item_sk is not null and ss_sold_date_sk is not null and ss_store_sk is not null) + TableScan [TS_0] (rows=575995635 width=88) + default@store_sales,store_sales,Tbl:COMPLETE,Col:NONE,Output:["ss_sold_date_sk","ss_item_sk","ss_cdemo_sk","ss_store_sk","ss_quantity","ss_list_price","ss_sales_price","ss_coupon_amt"] + <-Map 8 [SIMPLE_EDGE] + SHUFFLE [RS_16] + PartitionCols:_col0 + Select Operator [SEL_5] (rows=232725 width=385) + Output:["_col0"] + Filter Operator [FIL_52] (rows=232725 width=385) + predicate:((cd_education_status = '2 yr Degree') and (cd_gender = 'M') and (cd_marital_status = 'U') and cd_demo_sk is not null) + TableScan [TS_3] (rows=1861800 width=385) + default@customer_demographics,customer_demographics,Tbl:COMPLETE,Col:NONE,Output:["cd_demo_sk","cd_gender","cd_marital_status","cd_education_status"] diff --git ql/src/test/results/clientpositive/tez/topn_key.q.out ql/src/test/results/clientpositive/tez/topn_key.q.out new file mode 100644 index 0000000000..53a7ea49de --- /dev/null +++ ql/src/test/results/clientpositive/tez/topn_key.q.out @@ -0,0 +1,183 @@ +PREHOOK: query: 
EXPLAIN +SELECT key, sum(substr(value,5)) FROM src GROUP BY key ORDER BY key LIMIT 5 +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +SELECT key, sum(substr(value,5)) FROM src GROUP BY key ORDER BY key LIMIT 5 +POSTHOOK: type: QUERY +Plan optimized by CBO. + +Vertex dependency in root stage +Reducer 2 <- Map 1 (SIMPLE_EDGE) +Reducer 3 <- Reducer 2 (SIMPLE_EDGE) + +Stage-0 + Fetch Operator + limit:5 + Stage-1 + Reducer 3 + File Output Operator [FS_10] + Limit [LIM_9] (rows=5 width=95) + Number of rows:5 + Select Operator [SEL_8] (rows=250 width=95) + Output:["_col0","_col1"] + <-Reducer 2 [SIMPLE_EDGE] + SHUFFLE [RS_7] + Group By Operator [GBY_5] (rows=250 width=95) + Output:["_col0","_col1"],aggregations:["sum(VALUE._col0)"],keys:KEY._col0 + Top N Key Operator [TNK_11] (rows=250 width=95) + key column names:["KEY._col0"],keys:KEY._col0,order:+,top n:5 + <-Map 1 [SIMPLE_EDGE] + SHUFFLE [RS_4] + PartitionCols:_col0 + Group By Operator [GBY_3] (rows=250 width=95) + Output:["_col0","_col1"],aggregations:["sum(_col1)"],keys:_col0 + Select Operator [SEL_1] (rows=500 width=178) + Output:["_col0","_col1"] + Top N Key Operator [TNK_12] (rows=500 width=178) + key column names:["key"],keys:key,order:+,top n:5 + TableScan [TS_0] (rows=500 width=178) + default@src,src,Tbl:COMPLETE,Col:COMPLETE,Output:["key","value"] + +PREHOOK: query: SELECT key, sum(substr(value,5)) FROM src GROUP BY key ORDER BY key LIMIT 5 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: SELECT key, sum(substr(value,5)) FROM src GROUP BY key ORDER BY key LIMIT 5 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +0 0.0 +10 10.0 +100 200.0 +103 206.0 +104 208.0 +PREHOOK: query: EXPLAIN +SELECT src.key FROM src GROUP BY src.key LIMIT 5 +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +SELECT src.key FROM src GROUP BY src.key LIMIT 5 +POSTHOOK: type: QUERY +Plan optimized by CBO. + +Vertex dependency in root stage +Reducer 2 <- Map 1 (SIMPLE_EDGE) + +Stage-0 + Fetch Operator + limit:5 + Stage-1 + Reducer 2 + File Output Operator [FS_7] + Limit [LIM_6] (rows=5 width=87) + Number of rows:5 + Group By Operator [GBY_4] (rows=250 width=87) + Output:["_col0"],keys:KEY._col0 + <-Map 1 [SIMPLE_EDGE] + SHUFFLE [RS_3] + PartitionCols:_col0 + Group By Operator [GBY_2] (rows=250 width=87) + Output:["_col0"],keys:key + Select Operator [SEL_1] (rows=500 width=87) + Output:["key"] + Top N Key Operator [TNK_8] (rows=500 width=87) + key column names:["key"],keys:key,order:+,top n:5 + TableScan [TS_0] (rows=500 width=87) + default@src,src,Tbl:COMPLETE,Col:COMPLETE,Output:["key"] + +PREHOOK: query: SELECT src.key FROM src GROUP BY src.key LIMIT 5 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: SELECT src.key FROM src GROUP BY src.key LIMIT 5 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +0 +10 +100 +103 +104 +PREHOOK: query: EXPLAIN +SELECT src1.key AS k, SUM(SUBSTR(src2.value,5)) AS v FROM + src AS src1 JOIN src AS src2 ON (src1.key = src2.key) + GROUP BY src1.key ORDER BY k LIMIT 5 +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +SELECT src1.key AS k, SUM(SUBSTR(src2.value,5)) AS v FROM + src AS src1 JOIN src AS src2 ON (src1.key = src2.key) + GROUP BY src1.key ORDER BY k LIMIT 5 +POSTHOOK: type: QUERY +Plan optimized by CBO. 
+ +Vertex dependency in root stage +Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 5 (SIMPLE_EDGE) +Reducer 3 <- Reducer 2 (SIMPLE_EDGE) +Reducer 4 <- Reducer 3 (SIMPLE_EDGE) + +Stage-0 + Fetch Operator + limit:5 + Stage-1 + Reducer 4 + File Output Operator [FS_18] + Limit [LIM_17] (rows=5 width=95) + Number of rows:5 + Select Operator [SEL_16] (rows=309 width=95) + Output:["_col0","_col1"] + <-Reducer 3 [SIMPLE_EDGE] + SHUFFLE [RS_15] + Group By Operator [GBY_13] (rows=309 width=95) + Output:["_col0","_col1"],aggregations:["sum(VALUE._col0)"],keys:KEY._col0 + Top N Key Operator [TNK_23] (rows=309 width=95) + key column names:["KEY._col0"],keys:KEY._col0,order:+,top n:5 + <-Reducer 2 [SIMPLE_EDGE] + SHUFFLE [RS_12] + PartitionCols:_col0 + Group By Operator [GBY_11] (rows=309 width=95) + Output:["_col0","_col1"],aggregations:["sum(_col1)"],keys:_col0 + Select Operator [SEL_9] (rows=809 width=178) + Output:["_col0","_col1"] + Top N Key Operator [TNK_24] (rows=809 width=178) + key column names:["_col0"],keys:_col0,order:+,top n:5 + Merge Join Operator [MERGEJOIN_27] (rows=809 width=178) + Conds:RS_6._col0=RS_7._col0(Inner),Output:["_col0","_col2"] + <-Map 1 [SIMPLE_EDGE] + SHUFFLE [RS_6] + PartitionCols:_col0 + Select Operator [SEL_2] (rows=500 width=87) + Output:["_col0"] + Filter Operator [FIL_21] (rows=500 width=87) + predicate:key is not null + Top N Key Operator [TNK_25] (rows=500 width=87) + key column names:["key"],keys:key,order:+,top n:5 + TableScan [TS_0] (rows=500 width=87) + default@src,src1,Tbl:COMPLETE,Col:COMPLETE,Output:["key"] + <-Map 5 [SIMPLE_EDGE] + SHUFFLE [RS_7] + PartitionCols:_col0 + Select Operator [SEL_5] (rows=500 width=178) + Output:["_col0","_col1"] + Filter Operator [FIL_22] (rows=500 width=178) + predicate:key is not null + Top N Key Operator [TNK_26] (rows=500 width=178) + key column names:["key"],keys:key,order:+,top n:5 + TableScan [TS_3] (rows=500 width=178) + default@src,src2,Tbl:COMPLETE,Col:COMPLETE,Output:["key","value"] + +PREHOOK: query: SELECT src1.key AS k, sum(substr(src2.value,5)) AS v FROM + src AS src1 JOIN src AS src2 ON (src1.key = src2.key) + GROUP BY src1.key ORDER BY k LIMIT 5 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: SELECT src1.key AS k, sum(substr(src2.value,5)) AS v FROM + src AS src1 JOIN src AS src2 ON (src1.key = src2.key) + GROUP BY src1.key ORDER BY k LIMIT 5 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +0 0.0 +10 10.0 +100 400.0 +103 412.0 +104 416.0 diff --git ql/src/test/results/clientpositive/tez/vector_topn_key.q.out ql/src/test/results/clientpositive/tez/vector_topn_key.q.out new file mode 100644 index 0000000000..a79e63df10 --- /dev/null +++ ql/src/test/results/clientpositive/tez/vector_topn_key.q.out @@ -0,0 +1,270 @@ +PREHOOK: query: explain vectorization +SELECT key, sum(substr(value,5)) FROM src GROUP BY key ORDER BY key LIMIT 5 +PREHOOK: type: QUERY +POSTHOOK: query: explain vectorization +SELECT key, sum(substr(value,5)) FROM src GROUP BY key ORDER BY key LIMIT 5 +POSTHOOK: type: QUERY +Plan optimized by CBO. 
+ +Vertex dependency in root stage +Reducer 2 <- Map 1 (SIMPLE_EDGE) +Reducer 3 <- Reducer 2 (SIMPLE_EDGE) + +Stage-0 + Fetch Operator + limit:5 + Stage-1 + Reducer 3 vectorized + File Output Operator [FS_17] + Limit [LIM_16] (rows=5 width=95) + Number of rows:5 + Select Operator [SEL_15] (rows=250 width=95) + Output:["_col0","_col1"] + <-Reducer 2 [SIMPLE_EDGE] + SHUFFLE [RS_7] + Group By Operator [GBY_5] (rows=250 width=95) + Output:["_col0","_col1"],aggregations:["sum(VALUE._col0)"],keys:KEY._col0 + Top N Key Operator [TNK_11] (rows=250 width=95) + key column names:["KEY._col0"],keys:KEY._col0,order:+,top n:5 + <-Map 1 [SIMPLE_EDGE] + SHUFFLE [RS_4] + PartitionCols:_col0 + Group By Operator [GBY_3] (rows=250 width=95) + Output:["_col0","_col1"],aggregations:["sum(_col1)"],keys:_col0 + Select Operator [SEL_1] (rows=500 width=178) + Output:["_col0","_col1"] + Top N Key Operator [TNK_12] (rows=500 width=178) + key column names:["key"],keys:key,order:+,top n:5 + TableScan [TS_0] (rows=500 width=178) + default@src,src,Tbl:COMPLETE,Col:COMPLETE,Output:["key","value"] + +PREHOOK: query: SELECT key, sum(substr(value,5)) FROM src GROUP BY key ORDER BY key LIMIT 5 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: SELECT key, sum(substr(value,5)) FROM src GROUP BY key ORDER BY key LIMIT 5 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +0 0.0 +10 10.0 +100 200.0 +103 206.0 +104 208.0 +PREHOOK: query: explain vectorization +SELECT key FROM src GROUP BY key LIMIT 5 +PREHOOK: type: QUERY +POSTHOOK: query: explain vectorization +SELECT key FROM src GROUP BY key LIMIT 5 +POSTHOOK: type: QUERY +Plan optimized by CBO. + +Vertex dependency in root stage +Reducer 2 <- Map 1 (SIMPLE_EDGE) + +Stage-0 + Fetch Operator + limit:5 + Stage-1 + Reducer 2 vectorized + File Output Operator [FS_15] + Limit [LIM_14] (rows=5 width=87) + Number of rows:5 + Group By Operator [GBY_13] (rows=250 width=87) + Output:["_col0"],keys:KEY._col0 + <-Map 1 [SIMPLE_EDGE] vectorized + SHUFFLE [RS_12] + PartitionCols:_col0 + Group By Operator [GBY_11] (rows=250 width=87) + Output:["_col0"],keys:key + Select Operator [SEL_10] (rows=500 width=87) + Output:["key"] + Top N Key Operator [TNK_9] (rows=500 width=87) + key column names:["key"],keys:key,order:+,top n:5 + TableScan [TS_0] (rows=500 width=87) + default@src,src,Tbl:COMPLETE,Col:COMPLETE,Output:["key"] + +PREHOOK: query: SELECT key FROM src GROUP BY key LIMIT 5 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: SELECT key FROM src GROUP BY key LIMIT 5 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +92 +95 +96 +97 +98 +Warning: Shuffle Join MERGEJOIN[15][tables = [src1, src2]] in Stage 'Reducer 2' is a cross product +PREHOOK: query: EXPLAIN VECTORIZATION +SELECT src1.key as k1, src1.value as v1, + src2.key as k2, src2.value as v2 FROM + (SELECT * FROM src WHERE src.key < 10) src1 + JOIN + (SELECT * FROM src WHERE src.key < 10) src2 + SORT BY k1, v1, k2, v2 +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN VECTORIZATION +SELECT src1.key as k1, src1.value as v1, + src2.key as k2, src2.value as v2 FROM + (SELECT * FROM src WHERE src.key < 10) src1 + JOIN + (SELECT * FROM src WHERE src.key < 10) src2 + SORT BY k1, v1, k2, v2 +POSTHOOK: type: QUERY +Plan not optimized by CBO. 
+ +Vertex dependency in root stage +Reducer 2 <- Map 1 (XPROD_EDGE), Map 4 (XPROD_EDGE) +Reducer 3 <- Reducer 2 (SIMPLE_EDGE) + +Stage-0 + Fetch Operator + limit:-1 + Stage-1 + Reducer 3 vectorized + File Output Operator [FS_23] + Select Operator [SEL_22] (rows=27556 width=356) + Output:["_col0","_col1","_col2","_col3"] + <-Reducer 2 [SIMPLE_EDGE] + SHUFFLE [RS_10] + Merge Join Operator [MERGEJOIN_15] (rows=27556 width=356) + Conds:(Inner),Output:["_col0","_col1","_col2","_col3"] + <-Map 1 [XPROD_EDGE] vectorized + XPROD_EDGE [RS_18] + Select Operator [SEL_17] (rows=166 width=178) + Output:["_col0","_col1"] + Filter Operator [FIL_16] (rows=166 width=178) + predicate:(key < 10) + TableScan [TS_0] (rows=500 width=178) + default@src,src,Tbl:COMPLETE,Col:COMPLETE,Output:["key","value"] + <-Map 4 [XPROD_EDGE] vectorized + XPROD_EDGE [RS_21] + Select Operator [SEL_20] (rows=166 width=178) + Output:["_col0","_col1"] + Filter Operator [FIL_19] (rows=166 width=178) + predicate:(key < 10) + TableScan [TS_3] (rows=500 width=178) + default@src,src,Tbl:COMPLETE,Col:COMPLETE,Output:["key","value"] + +Warning: Shuffle Join MERGEJOIN[15][tables = [src1, src2]] in Stage 'Reducer 2' is a cross product +PREHOOK: query: SELECT src1.key as k1, src1.value as v1, + src2.key as k2, src2.value as v2 FROM + (SELECT * FROM src WHERE src.key < 10) src1 + JOIN + (SELECT * FROM src WHERE src.key < 10) src2 + SORT BY k1, v1, k2, v2 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: SELECT src1.key as k1, src1.value as v1, + src2.key as k2, src2.value as v2 FROM + (SELECT * FROM src WHERE src.key < 10) src1 + JOIN + (SELECT * FROM src WHERE src.key < 10) src2 + SORT BY k1, v1, k2, v2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +0 val_0 0 val_0 +0 val_0 0 val_0 +0 val_0 0 val_0 +0 val_0 0 val_0 +0 val_0 0 val_0 +0 val_0 0 val_0 +0 val_0 0 val_0 +0 val_0 0 val_0 +0 val_0 0 val_0 +0 val_0 2 val_2 +0 val_0 2 val_2 +0 val_0 2 val_2 +0 val_0 4 val_4 +0 val_0 4 val_4 +0 val_0 4 val_4 +0 val_0 5 val_5 +0 val_0 5 val_5 +0 val_0 5 val_5 +0 val_0 5 val_5 +0 val_0 5 val_5 +0 val_0 5 val_5 +0 val_0 5 val_5 +0 val_0 5 val_5 +0 val_0 5 val_5 +0 val_0 8 val_8 +0 val_0 8 val_8 +0 val_0 8 val_8 +0 val_0 9 val_9 +0 val_0 9 val_9 +0 val_0 9 val_9 +2 val_2 0 val_0 +2 val_2 0 val_0 +2 val_2 0 val_0 +2 val_2 2 val_2 +2 val_2 4 val_4 +2 val_2 5 val_5 +2 val_2 5 val_5 +2 val_2 5 val_5 +2 val_2 8 val_8 +2 val_2 9 val_9 +4 val_4 0 val_0 +4 val_4 0 val_0 +4 val_4 0 val_0 +4 val_4 2 val_2 +4 val_4 4 val_4 +4 val_4 5 val_5 +4 val_4 5 val_5 +4 val_4 5 val_5 +4 val_4 8 val_8 +4 val_4 9 val_9 +5 val_5 0 val_0 +5 val_5 0 val_0 +5 val_5 0 val_0 +5 val_5 0 val_0 +5 val_5 0 val_0 +5 val_5 0 val_0 +5 val_5 0 val_0 +5 val_5 0 val_0 +5 val_5 0 val_0 +5 val_5 2 val_2 +5 val_5 2 val_2 +5 val_5 2 val_2 +5 val_5 4 val_4 +5 val_5 4 val_4 +5 val_5 4 val_4 +5 val_5 5 val_5 +5 val_5 5 val_5 +5 val_5 5 val_5 +5 val_5 5 val_5 +5 val_5 5 val_5 +5 val_5 5 val_5 +5 val_5 5 val_5 +5 val_5 5 val_5 +5 val_5 5 val_5 +5 val_5 8 val_8 +5 val_5 8 val_8 +5 val_5 8 val_8 +5 val_5 9 val_9 +5 val_5 9 val_9 +5 val_5 9 val_9 +8 val_8 0 val_0 +8 val_8 0 val_0 +8 val_8 0 val_0 +8 val_8 2 val_2 +8 val_8 4 val_4 +8 val_8 5 val_5 +8 val_8 5 val_5 +8 val_8 5 val_5 +8 val_8 8 val_8 +8 val_8 9 val_9 +9 val_9 0 val_0 +9 val_9 0 val_0 +9 val_9 0 val_0 +9 val_9 2 val_2 +9 val_9 4 val_4 +9 val_9 5 val_5 +9 val_9 5 val_5 +9 val_9 5 val_5 +9 val_9 8 val_8 +9 val_9 9 val_9 diff --git 
ql/src/test/results/clientpositive/topn_key.q.out ql/src/test/results/clientpositive/topn_key.q.out new file mode 100644 index 0000000000..4977a75bdb --- /dev/null +++ ql/src/test/results/clientpositive/topn_key.q.out @@ -0,0 +1,313 @@ +PREHOOK: query: EXPLAIN +SELECT key, sum(substr(value,5)) FROM src GROUP BY key ORDER BY key LIMIT 5 +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +SELECT key, sum(substr(value,5)) FROM src GROUP BY key ORDER BY key LIMIT 5 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string), substr(value, 5) (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: sum(_col1) + keys: _col0 (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + TopN Hash Memory Usage: 0.1 + value expressions: _col1 (type: double) + Reduce Operator Tree: + Group By Operator + aggregations: sum(VALUE._col0) + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + TopN Hash Memory Usage: 0.1 + value expressions: _col1 (type: double) + Reduce Operator Tree: + Select Operator + expressions: KEY.reducesinkkey0 (type: string), VALUE._col0 (type: double) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + Limit + Number of rows: 5 + Statistics: Num rows: 5 Data size: 50 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 5 Data size: 50 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: 5 + Processor Tree: + ListSink + +PREHOOK: query: SELECT key, sum(substr(value,5)) FROM src GROUP BY key ORDER BY key LIMIT 5 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: SELECT key, sum(substr(value,5)) FROM src GROUP BY key ORDER BY key LIMIT 5 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +0 0.0 +10 10.0 +100 200.0 +103 206.0 +104 208.0 +PREHOOK: query: EXPLAIN +SELECT src.key FROM src GROUP BY src.key LIMIT 5 +PREHOOK: 
type: QUERY +POSTHOOK: query: EXPLAIN +SELECT src.key FROM src GROUP BY src.key LIMIT 5 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + keys: key (type: string) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + TopN Hash Memory Usage: 0.1 + Reduce Operator Tree: + Group By Operator + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + Limit + Number of rows: 5 + Statistics: Num rows: 5 Data size: 50 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 5 Data size: 50 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: 5 + Processor Tree: + ListSink + +PREHOOK: query: SELECT src.key FROM src GROUP BY src.key LIMIT 5 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: SELECT src.key FROM src GROUP BY src.key LIMIT 5 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +0 +10 +100 +103 +104 +PREHOOK: query: EXPLAIN +SELECT src1.key AS k, SUM(SUBSTR(src2.value,5)) AS v FROM + src AS src1 JOIN src AS src2 ON (src1.key = src2.key) + GROUP BY src1.key ORDER BY k LIMIT 5 +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +SELECT src1.key AS k, SUM(SUBSTR(src2.value,5)) AS v FROM + src AS src1 JOIN src AS src2 ON (src1.key = src2.key) + GROUP BY src1.key ORDER BY k LIMIT 5 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-3 depends on stages: Stage-2 + Stage-0 depends on stages: Stage-3 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: src1 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: key is not null (type: boolean) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + TableScan + alias: src2 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: key is not null (type: boolean) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + 
Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: string) + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: string) + 1 _col0 (type: string) + outputColumnNames: _col0, _col2 + Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: string), substr(_col2, 5) (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: sum(_col1) + keys: _col0 (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE + TopN Hash Memory Usage: 0.1 + value expressions: _col1 (type: double) + Reduce Operator Tree: + Group By Operator + aggregations: sum(VALUE._col0) + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-3 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE + TopN Hash Memory Usage: 0.1 + value expressions: _col1 (type: double) + Reduce Operator Tree: + Select Operator + expressions: KEY.reducesinkkey0 (type: string), VALUE._col0 (type: double) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE + Limit + Number of rows: 5 + Statistics: Num rows: 5 Data size: 50 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 5 Data size: 50 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: 5 + Processor Tree: + ListSink + +PREHOOK: query: SELECT src1.key AS k, sum(substr(src2.value,5)) AS v FROM + src AS src1 JOIN src AS src2 ON (src1.key = src2.key) + GROUP BY src1.key ORDER BY k LIMIT 5 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was 
here #### +POSTHOOK: query: SELECT src1.key AS k, sum(substr(src2.value,5)) AS v FROM + src AS src1 JOIN src AS src2 ON (src1.key = src2.key) + GROUP BY src1.key ORDER BY k LIMIT 5 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +0 0.0 +10 10.0 +100 400.0 +103 412.0 +104 416.0 diff --git ql/src/test/results/clientpositive/vector_topn_key.q.out ql/src/test/results/clientpositive/vector_topn_key.q.out new file mode 100644 index 0000000000..7fbdd56567 --- /dev/null +++ ql/src/test/results/clientpositive/vector_topn_key.q.out @@ -0,0 +1,449 @@ +PREHOOK: query: explain vectorization +SELECT key, sum(substr(value,5)) FROM src GROUP BY key ORDER BY key LIMIT 5 +PREHOOK: type: QUERY +POSTHOOK: query: explain vectorization +SELECT key, sum(substr(value,5)) FROM src GROUP BY key ORDER BY key LIMIT 5 +POSTHOOK: type: QUERY +PLAN VECTORIZATION: + enabled: true + enabledConditionsMet: [hive.vectorized.execution.enabled IS true] + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string), substr(value, 5) (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: sum(_col1) + keys: _col0 (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + TopN Hash Memory Usage: 0.1 + value expressions: _col1 (type: double) + Map Vectorization: + enabled: true + enabledConditionsMet: hive.vectorized.use.vector.serde.deserialize IS true + inputFileFormats: org.apache.hadoop.mapred.TextInputFormat + notVectorizedReason: GROUPBY operator: Vector aggregation : "sum" for input type: "BYTES" and output type: "DOUBLE" and mode: PARTIAL1 not supported for evaluator GenericUDAFSumDouble + vectorized: false + Reduce Vectorization: + enabled: false + enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true + enableConditionsNotMet: hive.execution.engine mr IN [tez, spark] IS false + Reduce Operator Tree: + Group By Operator + aggregations: sum(VALUE._col0) + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + TopN Hash Memory Usage: 0.1 + value expressions: _col1 (type: double) + Execution mode: vectorized + Map Vectorization: + enabled: true + enabledConditionsMet: hive.vectorized.use.vector.serde.deserialize IS true + inputFormatFeatureSupport: [] + featureSupportInUse: 
[] + inputFileFormats: org.apache.hadoop.mapred.SequenceFileInputFormat + allNative: false + usesVectorUDFAdaptor: false + vectorized: true + Reduce Vectorization: + enabled: false + enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true + enableConditionsNotMet: hive.execution.engine mr IN [tez, spark] IS false + Reduce Operator Tree: + Select Operator + expressions: KEY.reducesinkkey0 (type: string), VALUE._col0 (type: double) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + Limit + Number of rows: 5 + Statistics: Num rows: 5 Data size: 50 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 5 Data size: 50 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: 5 + Processor Tree: + ListSink + +PREHOOK: query: SELECT key, sum(substr(value,5)) FROM src GROUP BY key ORDER BY key LIMIT 5 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: SELECT key, sum(substr(value,5)) FROM src GROUP BY key ORDER BY key LIMIT 5 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +0 0.0 +10 10.0 +100 200.0 +103 206.0 +104 208.0 +PREHOOK: query: explain vectorization +SELECT key FROM src GROUP BY key LIMIT 5 +PREHOOK: type: QUERY +POSTHOOK: query: explain vectorization +SELECT key FROM src GROUP BY key LIMIT 5 +POSTHOOK: type: QUERY +PLAN VECTORIZATION: + enabled: true + enabledConditionsMet: [hive.vectorized.execution.enabled IS true] + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + keys: key (type: string) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + TopN Hash Memory Usage: 0.1 + Execution mode: vectorized + Map Vectorization: + enabled: true + enabledConditionsMet: hive.vectorized.use.vector.serde.deserialize IS true + inputFormatFeatureSupport: [DECIMAL_64] + featureSupportInUse: [DECIMAL_64] + inputFileFormats: org.apache.hadoop.mapred.TextInputFormat + allNative: false + usesVectorUDFAdaptor: false + vectorized: true + Reduce Vectorization: + enabled: false + enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true + enableConditionsNotMet: hive.execution.engine mr IN [tez, spark] IS false + Reduce Operator Tree: + Group By Operator + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + Limit + Number of rows: 5 + Statistics: Num rows: 5 Data size: 50 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: 
false + Statistics: Num rows: 5 Data size: 50 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: 5 + Processor Tree: + ListSink + +PREHOOK: query: SELECT key FROM src GROUP BY key LIMIT 5 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: SELECT key FROM src GROUP BY key LIMIT 5 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +0 +10 +100 +103 +104 +Warning: Shuffle Join JOIN[8][tables = [src1, src2]] in Stage 'Stage-1:MAPRED' is a cross product +PREHOOK: query: EXPLAIN VECTORIZATION +SELECT src1.key as k1, src1.value as v1, + src2.key as k2, src2.value as v2 FROM + (SELECT * FROM src WHERE src.key < 10) src1 + JOIN + (SELECT * FROM src WHERE src.key < 10) src2 + SORT BY k1, v1, k2, v2 +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN VECTORIZATION +SELECT src1.key as k1, src1.value as v1, + src2.key as k2, src2.value as v2 FROM + (SELECT * FROM src WHERE src.key < 10) src1 + JOIN + (SELECT * FROM src WHERE src.key < 10) src2 + SORT BY k1, v1, k2, v2 +POSTHOOK: type: QUERY +PLAN VECTORIZATION: + enabled: true + enabledConditionsMet: [hive.vectorized.execution.enabled IS true] + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (key < 10) (type: boolean) + Statistics: Num rows: 166 Data size: 1763 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 166 Data size: 1763 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 166 Data size: 1763 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: string), _col1 (type: string) + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (key < 10) (type: boolean) + Statistics: Num rows: 166 Data size: 1763 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 166 Data size: 1763 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 166 Data size: 1763 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: string), _col1 (type: string) + Map Vectorization: + enabled: false + enabledConditionsNotMet: Vectorized map work only works with 1 TableScanOperator IS false + Reduce Vectorization: + enabled: false + enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true + enableConditionsNotMet: hive.execution.engine mr IN [tez, spark] IS false + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 + 1 + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 27556 Data size: 612872 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat 
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + key expressions: _col0 (type: string), _col1 (type: string), _col2 (type: string), _col3 (type: string) + sort order: ++++ + Statistics: Num rows: 27556 Data size: 612872 Basic stats: COMPLETE Column stats: NONE + Execution mode: vectorized + Map Vectorization: + enabled: true + enabledConditionsMet: hive.vectorized.use.vector.serde.deserialize IS true + inputFormatFeatureSupport: [] + featureSupportInUse: [] + inputFileFormats: org.apache.hadoop.mapred.SequenceFileInputFormat + allNative: false + usesVectorUDFAdaptor: false + vectorized: true + Reduce Vectorization: + enabled: false + enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true + enableConditionsNotMet: hive.execution.engine mr IN [tez, spark] IS false + Reduce Operator Tree: + Select Operator + expressions: KEY.reducesinkkey0 (type: string), KEY.reducesinkkey1 (type: string), KEY.reducesinkkey2 (type: string), KEY.reducesinkkey3 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 27556 Data size: 612872 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 27556 Data size: 612872 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +Warning: Shuffle Join JOIN[8][tables = [src1, src2]] in Stage 'Stage-1:MAPRED' is a cross product +PREHOOK: query: SELECT src1.key as k1, src1.value as v1, + src2.key as k2, src2.value as v2 FROM + (SELECT * FROM src WHERE src.key < 10) src1 + JOIN + (SELECT * FROM src WHERE src.key < 10) src2 + SORT BY k1, v1, k2, v2 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: SELECT src1.key as k1, src1.value as v1, + src2.key as k2, src2.value as v2 FROM + (SELECT * FROM src WHERE src.key < 10) src1 + JOIN + (SELECT * FROM src WHERE src.key < 10) src2 + SORT BY k1, v1, k2, v2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +0 val_0 0 val_0 +0 val_0 0 val_0 +0 val_0 0 val_0 +0 val_0 0 val_0 +0 val_0 0 val_0 +0 val_0 0 val_0 +0 val_0 0 val_0 +0 val_0 0 val_0 +0 val_0 0 val_0 +0 val_0 2 val_2 +0 val_0 2 val_2 +0 val_0 2 val_2 +0 val_0 4 val_4 +0 val_0 4 val_4 +0 val_0 4 val_4 +0 val_0 5 val_5 +0 val_0 5 val_5 +0 val_0 5 val_5 +0 val_0 5 val_5 +0 val_0 5 val_5 +0 val_0 5 val_5 +0 val_0 5 val_5 +0 val_0 5 val_5 +0 val_0 5 val_5 +0 val_0 8 val_8 +0 val_0 8 val_8 +0 val_0 8 val_8 +0 val_0 9 val_9 +0 val_0 9 val_9 +0 val_0 9 val_9 +2 val_2 0 val_0 +2 val_2 0 val_0 +2 val_2 0 val_0 +2 val_2 2 val_2 +2 val_2 4 val_4 +2 val_2 5 val_5 +2 val_2 5 val_5 +2 val_2 5 val_5 +2 val_2 8 val_8 +2 val_2 9 val_9 +4 val_4 0 val_0 +4 val_4 0 val_0 +4 val_4 0 val_0 +4 val_4 2 val_2 +4 val_4 4 val_4 +4 val_4 5 val_5 +4 val_4 5 val_5 +4 val_4 5 val_5 +4 val_4 8 val_8 +4 val_4 9 val_9 +5 val_5 0 val_0 +5 val_5 0 val_0 +5 val_5 0 val_0 +5 val_5 0 val_0 +5 val_5 0 val_0 +5 val_5 0 val_0 +5 val_5 0 val_0 +5 val_5 0 val_0 +5 val_5 0 val_0 +5 val_5 2 val_2 +5 val_5 2 val_2 +5 val_5 2 val_2 +5 val_5 4 val_4 +5 val_5 4 val_4 +5 val_5 4 val_4 +5 
val_5 5 val_5 +5 val_5 5 val_5 +5 val_5 5 val_5 +5 val_5 5 val_5 +5 val_5 5 val_5 +5 val_5 5 val_5 +5 val_5 5 val_5 +5 val_5 5 val_5 +5 val_5 5 val_5 +5 val_5 8 val_8 +5 val_5 8 val_8 +5 val_5 8 val_8 +5 val_5 9 val_9 +5 val_5 9 val_9 +5 val_5 9 val_9 +8 val_8 0 val_0 +8 val_8 0 val_0 +8 val_8 0 val_0 +8 val_8 2 val_2 +8 val_8 4 val_4 +8 val_8 5 val_5 +8 val_8 5 val_5 +8 val_8 5 val_5 +8 val_8 8 val_8 +8 val_8 9 val_9 +9 val_9 0 val_0 +9 val_9 0 val_0 +9 val_9 0 val_0 +9 val_9 2 val_2 +9 val_9 4 val_4 +9 val_9 5 val_5 +9 val_9 5 val_5 +9 val_9 5 val_5 +9 val_9 8 val_8 +9 val_9 9 val_9 diff --git serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ObjectInspectorUtils.java serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ObjectInspectorUtils.java index 698ebe1ffe..8bec504597 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ObjectInspectorUtils.java +++ serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ObjectInspectorUtils.java @@ -783,6 +783,25 @@ public static int compare(Object[] o1, ObjectInspector[] oi1, Object[] o2, return 0; } + public static int compare(Object[] o1, ObjectInspector[] oi1, Object[] o2, + ObjectInspector[] oi2, boolean[] columnSortOrderIsDesc) { + assert (o1.length == oi1.length); + assert (o2.length == oi2.length); + assert (o1.length == o2.length); + + for (int i = 0; i < o1.length; i++) { + int r = compare(o1[i], oi1[i], o2[i], oi2[i]); + if (r != 0) { + if (columnSortOrderIsDesc[i]) { + return r; + } else { + return -r; + } + } + } + return 0; + } + /** * Whether comparison is supported for this type. * Currently all types that references any map are not comparable.
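Reviewer note (not part of the patch): the "Top N Key Operator" entries in the plans above and the new ObjectInspectorUtils.compare overload at the end of this diff fit together as follows. The operator tracks the best n keys seen so far under the query's sort order (shown in the plans as "top n: 5", "order: +") and forwards only rows whose key can still land in the top n, so the downstream shuffle and group-by see far fewer rows. The overload encodes the per-column direction: for a descending column the natural comparison result r is kept, for an ascending column it is negated, which makes a min-heap's head the current worst key. The sketch below is a minimal illustration under stated assumptions, not the Hive implementation: the class and method names are invented for the example, plain Comparable values stand in for ObjectInspector-based comparison, and the tie-forwarding rule is a conservative choice of the sketch rather than confirmed Hive behavior.

import java.util.Comparator;
import java.util.PriorityQueue;

/** Illustrative bounded top-n key filter; not the Hive TopNKeyOperator. */
public class TopNKeyFilterSketch {

  private final int topN;
  private final Comparator<Object[]> cmp;
  // Min-heap whose comparator ranks "worse" keys first, so peek() is the
  // eviction candidate. This mirrors the desc ? r : -r convention of the
  // compare(..., boolean[] columnSortOrderIsDesc) overload added above.
  private final PriorityQueue<Object[]> heap;

  public TopNKeyFilterSketch(int topN, boolean[] columnSortOrderIsDesc) {
    this.topN = topN;
    this.cmp = (k1, k2) -> {
      for (int i = 0; i < k1.length; i++) {
        @SuppressWarnings({"unchecked", "rawtypes"})
        int r = ((Comparable) k1[i]).compareTo(k2[i]);
        if (r != 0) {
          // Descending column: a larger key ranks better, keep r.
          // Ascending column: a smaller key ranks better, negate r.
          return columnSortOrderIsDesc[i] ? r : -r;
        }
      }
      return 0;
    };
    this.heap = new PriorityQueue<>(topN, cmp);
  }

  /** True if a row with this key may still belong to the top n. */
  public boolean canForward(Object[] key) {
    if (heap.size() < topN) {
      heap.offer(key.clone());
      return true;
    }
    int vsWorst = cmp.compare(key, heap.peek());
    if (vsWorst > 0) {          // strictly better than the current worst
      heap.poll();
      heap.offer(key.clone());
      return true;
    }
    return vsWorst == 0;        // forward ties conservatively, never admit worse
  }

  public static void main(String[] args) {
    // ORDER BY key ASC LIMIT 2 over single-column string keys.
    TopNKeyFilterSketch f = new TopNKeyFilterSketch(2, new boolean[] {false});
    for (String k : new String[] {"104", "0", "103", "10", "999"}) {
      System.out.println(k + " -> " + f.canForward(new Object[] {k}));
    }
    // Prints true for the first four probes and false for "999",
    // which can no longer displace {"0", "10"} from the top 2.
  }
}

A filter of this shape is a safe pre-filter rather than an exact LIMIT: it may forward extra rows (for example on ties, or keys admitted before better ones arrived), which is consistent with the plans above still keeping the Limit operator and the reduce-side "TopN Hash Memory Usage" entries; correctness does not depend on how aggressively the new operator prunes.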