diff --git data/files/vector_ptf_part_simple.txt data/files/vector_ptf_part_simple.txt new file mode 100644 index 0000000..2bcc7a6 --- /dev/null +++ data/files/vector_ptf_part_simple.txt @@ -0,0 +1,40 @@ +Manufacturer#2 almond aquamarine rose maroon antique 900.66 +Manufacturer#1 almond aquamarine burnished black steel 1414.42 +Manufacturer#2 almond aquamarine rose maroon antique 1698.66 +Manufacturer#1 almond aquamarine pink moccasin thistle \N +Manufacturer#1 almond antique chartreuse lavender yellow 1753.76 +Manufacturer#5 almond antique medium spring khaki 1611.66 +Manufacturer#5 almond antique blue firebrick mint 1789.69 +Manufacturer#1 almond antique burnished rose metallic 1173.15 +Manufacturer#1 almond aquamarine pink moccasin thistle 1632.66 +Manufacturer#3 almond antique forest lavender goldenrod 1190.27 +Manufacturer#4 almond aquamarine yellow dodger mint 1844.92 +Manufacturer#1 almond antique chartreuse lavender yellow 1753.76 +Manufacturer#2 almond antique violet turquoise frosted 1800.7 +Manufacturer#3 almond antique forest lavender goldenrod \N +Manufacturer#2 almond antique violet chocolate turquoise 1690.68 +Manufacturer#4 almond antique violet mint lemon 1375.42 +Manufacturer#1 almond aquamarine pink moccasin thistle 1632.66 +Manufacturer#5 almond azure blanched chiffon midnight 1464.48 +Manufacturer#3 almond antique forest lavender goldenrod 590.27 +Manufacturer#1 almond antique chartreuse lavender yellow 1753.76 +Manufacturer#2 almond antique violet turquoise frosted 1800.7 +Manufacturer#5 almond antique sky peru orange 1788.73 +Manufacturer#1 almond aquamarine pink moccasin thistle 1632.66 +Manufacturer#3 almond antique chartreuse khaki white 99.68 +Manufacturer#4 almond antique gainsboro frosted violet \N +Manufacturer#1 almond antique chartreuse lavender yellow 1753.76 +Manufacturer#2 almond antique violet turquoise frosted 1800.7 +Manufacturer#3 almond antique olive coral navajo 1337.29 +Manufacturer#5 almond antique medium spring khaki 1611.66 +Manufacturer#1 almond antique salmon chartreuse burlywood 1602.59 +Manufacturer#3 almond antique misty red olive 1922.98 +Manufacturer#2 almond aquamarine sandy cyan gainsboro 1000.6 +Manufacturer#3 almond antique forest lavender goldenrod 1190.27 +Manufacturer#2 almond aquamarine midnight light salmon 2031.98 +Manufacturer#4 almond aquamarine floral ivory bisque \N +Manufacturer#5 almond aquamarine dodger light gainsboro 1018.1 +Manufacturer#4 almond azure aquamarine papaya violet 1290.35 +Manufacturer#3 almond antique metallic orange dim 55.39 +Manufacturer#1 almond antique burnished rose metallic 1173.15 +Manufacturer#4 almond aquamarine floral ivory bisque 1206.26 diff --git itests/src/test/resources/testconfiguration.properties itests/src/test/resources/testconfiguration.properties index df80478..fbebcaa 100644 --- itests/src/test/resources/testconfiguration.properties +++ itests/src/test/resources/testconfiguration.properties @@ -635,6 +635,16 @@ minillaplocal.query.files=acid_globallimit.q,\ vector_number_compare_projection.q,\ vector_partitioned_date_time.q,\ vector_udf1.q,\ + vector_windowing_expressions.q,\ + vector_windowing_gby.q,\ + vector_windowing_gby2.q,\ + vector_windowing_multipartitioning.q,\ + vector_windowing_order_null.q,\ + vector_windowing_range_multiorder.q,\ + vector_windowing_rank.q,\ + vector_windowing_streaming.q,\ + vector_windowing_windowspec.q,\ + vector_windowing_windowspec4.q,\ vectorization_short_regress.q,\ vectorized_dynamic_partition_pruning.q,\ vectorized_dynamic_semijoin_reduction.q,\ 
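Note: the sketch below is illustrative only and is not part of the patch; the deliverGroup helper and its simplified types are hypothetical. The patch replaces the row-mode startGroup()/endGroup() signals with a single per-batch status call: before each key-grouped VectorizedRowBatch is pushed, the producer announces whether that batch is the last one for the current group key, which lets VectorPTFOperator write its computed values into the group's final batch. The intended calling convention is roughly:

  // Hypothetical sketch of the setNextVectorBatchGroupStatus() contract
  // (assumes org.apache.hadoop.hive.ql.exec.Operator and VectorizedRowBatch).
  void deliverGroup(Operator<?> reducer, List<VectorizedRowBatch> groupBatches, int tag)
      throws HiveException {
    final int n = groupBatches.size();
    for (int i = 0; i < n; i++) {
      // Announce whether the batch about to be processed closes the group key.
      reducer.setNextVectorBatchGroupStatus(i == n - 1);
      reducer.process(groupBatches.get(i), tag);
    }
  }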
diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java index 8b04cd4..ac35f91 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java @@ -632,6 +632,13 @@ public void endGroup() throws HiveException { defaultEndGroup(); } + // Tell the operator the status of the next key-grouped VectorizedRowBatch that will be delivered + // to the process method, e.g. by reduce-shuffle. These semantics are needed by PTF so it can + // efficiently add computed values to the last batch of a group key. + public void setNextVectorBatchGroupStatus(boolean isLastGroupBatch) throws HiveException { + // Do nothing. + } + // an blocking operator (e.g. GroupByOperator and JoinOperator) can // override this method to forward its outputs public void flush() throws HiveException { diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/OperatorFactory.java ql/src/java/org/apache/hadoop/hive/ql/exec/OperatorFactory.java index afe1484..c5a4217 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/OperatorFactory.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/OperatorFactory.java @@ -39,6 +39,7 @@ import org.apache.hadoop.hive.ql.exec.vector.VectorSparkPartitionPruningSinkOperator; import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext; import org.apache.hadoop.hive.ql.exec.vector.reducesink.VectorReduceSinkCommonOperator; +import org.apache.hadoop.hive.ql.exec.vector.ptf.VectorPTFOperator; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.optimizer.spark.SparkPartitionPruningSinkDesc; import org.apache.hadoop.hive.ql.parse.spark.SparkPartitionPruningSinkOperator; @@ -139,6 +140,7 @@ vectorOpvec.put(FileSinkDesc.class, VectorFileSinkOperator.class); vectorOpvec.put(FilterDesc.class, VectorFilterOperator.class); vectorOpvec.put(LimitDesc.class, VectorLimitOperator.class); + vectorOpvec.put(PTFDesc.class, VectorPTFOperator.class); vectorOpvec.put(SparkHashTableSinkDesc.class, VectorSparkHashTableSinkOperator.class); } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ReduceRecordSource.java ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ReduceRecordSource.java index 60660ac..548f1fc 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ReduceRecordSource.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ReduceRecordSource.java @@ -379,20 +379,6 @@ private boolean pushRecordVector() { BytesWritable keyWritable = (BytesWritable) reader.getCurrentKey(); valueWritables = reader.getCurrentValues(); - // Check if this is a new group or same group - if (handleGroupKey && !keyWritable.equals(this.groupKey)) { - // If a operator wants to do some work at the beginning of a group - if (groupKey == null) { // the first group - this.groupKey = new BytesWritable(); - } else { - // If a operator wants to do some work at the end of a group - reducer.endGroup(); - } - - groupKey.set(keyWritable.getBytes(), 0, keyWritable.getLength()); - reducer.startGroup(); - } - processVectorGroup(keyWritable, valueWritables, tag); return true; } catch (Throwable e) { @@ -408,15 +394,20 @@ private boolean pushRecordVector() { } /** + * + * @param keyWritable * @param values - * @return true if it is not done and can take more inputs + * @param tag + * @throws HiveException + * @throws IOException */ private void processVectorGroup(BytesWritable keyWritable, Iterable<Object> values, byte tag) throws HiveException, IOException { +
Preconditions.checkState(batch.size == 0); + // Deserialize key into vector row columns. - // Since we referencing byte column vector byte arrays by reference, we don't need - // a data buffer. + // byte[] keyBytes = keyWritable.getBytes(); int keyLength = keyWritable.getLength(); @@ -442,6 +433,24 @@ private void processVectorGroup(BytesWritable keyWritable, int batchBytes = keyBytes.length; try { for (Object value : values) { + if (rowIdx >= maxSize || + (rowIdx > 0 && batchBytes >= BATCH_BYTES)) { + + // Batch is full AND we have at least 1 more row... + batch.size = rowIdx; + if (handleGroupKey) { + reducer.setNextVectorBatchGroupStatus(/* isLastGroupBatch */ false); + } + reducer.process(batch, tag); + + // Reset just the value columns and value buffer. + for (int i = firstValueColumnOffset; i < batch.numCols; i++) { + // Note that reset also resets the data buffer for bytes column vectors. + batch.cols[i].reset(); + } + rowIdx = 0; + batchBytes = keyBytes.length; + } if (valueLazyBinaryDeserializeToRow != null) { // Deserialize value into vector row columns. BytesWritable valueWritable = (BytesWritable) value; @@ -456,24 +465,13 @@ private void processVectorGroup(BytesWritable keyWritable, valueLazyBinaryDeserializeToRow.deserialize(batch, rowIdx); } rowIdx++; - if (rowIdx >= maxSize || batchBytes >= BATCH_BYTES) { - - // Batch is full. - batch.size = rowIdx; - reducer.process(batch, tag); - - // Reset just the value columns and value buffer. - for (int i = firstValueColumnOffset; i < batch.numCols; i++) { - // Note that reset also resets the data buffer for bytes column vectors. - batch.cols[i].reset(); - } - rowIdx = 0; - batchBytes = 0; - } } if (rowIdx > 0) { // Flush final partial batch. - VectorizedBatchUtil.setBatchSize(batch, rowIdx); + batch.size = rowIdx; + if (handleGroupKey) { + reducer.setNextVectorBatchGroupStatus(/* isLastGroupBatch */ true); + } reducer.process(batch, tag); } batch.reset(); diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorGroupByOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorGroupByOperator.java index 4b76d74..b391cf2 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorGroupByOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorGroupByOperator.java @@ -148,8 +148,7 @@ */ private static interface IProcessingMode { public void initialize(Configuration hconf) throws HiveException; - public void startGroup() throws HiveException; - public void endGroup() throws HiveException; + public void setNextVectorBatchGroupStatus(boolean isLastGroupBatch) throws HiveException; public void processBatch(VectorizedRowBatch batch) throws HiveException; public void close(boolean aborted) throws HiveException; } @@ -159,14 +158,10 @@ */ private abstract class ProcessingModeBase implements IProcessingMode { - // Overridden and used in sorted reduce group batch processing mode. + // Overridden and used in ProcessingModeReduceMergePartial mode. @Override - public void startGroup() throws HiveException { - // Do nothing. - } - @Override - public void endGroup() throws HiveException { - // Do nothing. 
+ public void setNextVectorBatchGroupStatus(boolean isLastGroupBatch) throws HiveException { + throw new HiveException("Status call for next key-grouped VectorizedRowBatch not expected for ProcessingMode class " + this.getClass().getName()); } protected abstract void doProcessBatch(VectorizedRowBatch batch, boolean isFirstGroupingSet, @@ -258,6 +253,11 @@ public void initialize(Configuration hconf) throws HiveException { } @Override + public void setNextVectorBatchGroupStatus(boolean isLastGroupBatch) throws HiveException { + // Do nothing. + } + + @Override public void doProcessBatch(VectorizedRowBatch batch, boolean isFirstGroupingSet, boolean[] currentGroupingSetsOverrideIsNulls) throws HiveException { for (int i = 0; i < aggregators.length; ++i) { @@ -674,6 +674,11 @@ public void free(VectorAggregationBufferRow t) { } @Override + public void setNextVectorBatchGroupStatus(boolean isLastGroupBatch) throws HiveException { + // Do nothing. + } + + @Override public void doProcessBatch(VectorizedRowBatch batch, boolean isFirstGroupingSet, boolean[] currentGroupingSetsOverrideIsNulls) throws HiveException { @@ -762,8 +767,8 @@ public void close(boolean aborted) throws HiveException { */ private class ProcessingModeReduceMergePartial extends ProcessingModeBase { - private boolean inGroup; private boolean first; + private boolean isLastGroupBatch; /** * The group vector key helper. @@ -782,7 +787,7 @@ public void close(boolean aborted) throws HiveException { @Override public void initialize(Configuration hconf) throws HiveException { - inGroup = false; + isLastGroupBatch = true; // We do not include the dummy grouping set column in the output. So we pass outputKeyLength // instead of keyExpressions.length @@ -794,24 +799,16 @@ public void initialize(Configuration hconf) throws HiveException { } @Override - public void startGroup() throws HiveException { - inGroup = true; - first = true; - } - - @Override - public void endGroup() throws HiveException { - if (inGroup && !first) { - writeGroupRow(groupAggregators, buffer); - groupAggregators.reset(); + public void setNextVectorBatchGroupStatus(boolean isLastGroupBatch) throws HiveException { + if (this.isLastGroupBatch) { + first = true; } - inGroup = false; + this.isLastGroupBatch = isLastGroupBatch; } @Override public void doProcessBatch(VectorizedRowBatch batch, boolean isFirstGroupingSet, boolean[] currentGroupingSetsOverrideIsNulls) throws HiveException { - assert(inGroup); if (first) { // Copy the group key to output batch now. We'll copy in the aggregates at the end of the group.
first = false; @@ -828,11 +825,16 @@ public void doProcessBatch(VectorizedRowBatch batch, boolean isFirstGroupingSet, for (int i = 0; i < aggregators.length; ++i) { aggregators[i].aggregateInput(groupAggregators.getAggregationBuffer(i), batch); } + + if (isLastGroupBatch) { + writeGroupRow(groupAggregators, buffer); + groupAggregators.reset(); + } } @Override public void close(boolean aborted) throws HiveException { - if (!aborted && inGroup && !first) { + if (!aborted && !first && !isLastGroupBatch) { writeGroupRow(groupAggregators, buffer); } } @@ -1003,21 +1005,26 @@ private void changeToStreamingMode() throws HiveException { } @Override + public void setNextVectorBatchGroupStatus(boolean isLastGroupBatch) throws HiveException { + processingMode.setNextVectorBatchGroupStatus(isLastGroupBatch); + } + + @Override public void startGroup() throws HiveException { - processingMode.startGroup(); // We do not call startGroup on operators below because we are batching rows in // an output batch and the semantics will not work. // super.startGroup(); + throw new HiveException("Unexpected startGroup"); } @Override public void endGroup() throws HiveException { - processingMode.endGroup(); // We do not call endGroup on operators below because we are batching rows in // an output batch and the semantics will not work. // super.endGroup(); + throw new HiveException("Unexpected endGroup"); } @Override diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorSelectOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorSelectOperator.java index 5c490ef..992cbce 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorSelectOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorSelectOperator.java @@ -110,6 +110,14 @@ protected void initializeOp(Configuration hconf) throws HiveException { outputFieldNames, objectInspectors); } + // Must send on to VectorPTFOperator... + @Override + public void setNextVectorBatchGroupStatus(boolean isLastGroupBatch) throws HiveException { + for (Operator<? extends OperatorDesc> op : childOperators) { + op.setNextVectorBatchGroupStatus(isLastGroupBatch); + } + } + @Override public void process(Object row, int tag) throws HiveException { diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java index c3940cb..a9aba56 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java @@ -406,7 +406,7 @@ protected boolean needsImplicitCastForDecimal(GenericUDF udf) { return udfsNeedingImplicitDecimalCast.contains(udfClass); } - protected int getInputColumnIndex(String name) throws HiveException { + public int getInputColumnIndex(String name) throws HiveException { if (name == null) { throw new HiveException("Null column name"); } @@ -438,7 +438,7 @@ protected OutputColumnManager(int initialOutputCol) { private final Set<Integer> usedOutputColumns = new HashSet<Integer>(); - int allocateOutputColumn(TypeInfo typeInfo) throws HiveException { + int allocateOutputColumn(TypeInfo typeInfo) { if (initialOutputCol < 0) { // This is a test calling.
return 0; @@ -499,7 +499,7 @@ void freeOutputColumn(int index) { } } - public int allocateScratchColumn(TypeInfo typeInfo) throws HiveException { + public int allocateScratchColumn(TypeInfo typeInfo) { return ocm.allocateOutputColumn(typeInfo); } @@ -2635,7 +2635,7 @@ private Timestamp evaluateCastToTimestamp(ExprNodeDesc expr) throws HiveExceptio } } - static String getScratchName(TypeInfo typeInfo) throws HiveException { + static String getScratchName(TypeInfo typeInfo) { // For now, leave DECIMAL precision/scale in the name so DecimalColumnVector scratch columns // don't need their precision/scale adjusted... if (typeInfo.getCategory() == Category.PRIMITIVE && diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedBatchUtil.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedBatchUtil.java index 990e896..03c09e7 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedBatchUtil.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedBatchUtil.java @@ -579,7 +579,7 @@ public static StandardStructObjectInspector convertToStandardStructObjectInspect return typeInfoList.toArray(new TypeInfo[0]); } - static ColumnVector cloneColumnVector(ColumnVector source + public static ColumnVector makeLikeColumnVector(ColumnVector source ) throws HiveException{ if (source instanceof LongColumnVector) { return new LongColumnVector(((LongColumnVector) source).vector.length); @@ -598,25 +598,25 @@ static ColumnVector cloneColumnVector(ColumnVector source return new IntervalDayTimeColumnVector(((IntervalDayTimeColumnVector) source).getLength()); } else if (source instanceof ListColumnVector) { ListColumnVector src = (ListColumnVector) source; - ColumnVector child = cloneColumnVector(src.child); + ColumnVector child = makeLikeColumnVector(src.child); return new ListColumnVector(src.offsets.length, child); } else if (source instanceof MapColumnVector) { MapColumnVector src = (MapColumnVector) source; - ColumnVector keys = cloneColumnVector(src.keys); - ColumnVector values = cloneColumnVector(src.values); + ColumnVector keys = makeLikeColumnVector(src.keys); + ColumnVector values = makeLikeColumnVector(src.values); return new MapColumnVector(src.offsets.length, keys, values); } else if (source instanceof StructColumnVector) { StructColumnVector src = (StructColumnVector) source; ColumnVector[] copy = new ColumnVector[src.fields.length]; for(int i=0; i < copy.length; ++i) { - copy[i] = cloneColumnVector(src.fields[i]); + copy[i] = makeLikeColumnVector(src.fields[i]); } return new StructColumnVector(VectorizedRowBatch.DEFAULT_SIZE, copy); } else if (source instanceof UnionColumnVector) { UnionColumnVector src = (UnionColumnVector) source; ColumnVector[] copy = new ColumnVector[src.fields.length]; for(int i=0; i < copy.length; ++i) { - copy[i] = cloneColumnVector(src.fields[i]); + copy[i] = makeLikeColumnVector(src.fields[i]); } return new UnionColumnVector(src.tags.length, copy); } else @@ -625,6 +625,53 @@ static ColumnVector cloneColumnVector(ColumnVector source " is not supported!"); } + public static void swapColumnVector( + VectorizedRowBatch batch1, int batch1ColumnNum, + VectorizedRowBatch batch2, int batch2ColumnNum) { + ColumnVector colVector1 = batch1.cols[batch1ColumnNum]; + batch1.cols[batch1ColumnNum] = batch2.cols[batch2ColumnNum]; + batch2.cols[batch2ColumnNum] = colVector1; + } + + public static void copyRepeatingColumn(VectorizedRowBatch sourceBatch, int sourceColumnNum, + VectorizedRowBatch targetBatch, int targetColumnNum, boolean 
setByValue) { + ColumnVector sourceColVector = sourceBatch.cols[sourceColumnNum]; + ColumnVector targetColVector = targetBatch.cols[targetColumnNum]; + + targetColVector.isRepeating = true; + + if (!sourceColVector.noNulls) { + targetColVector.noNulls = false; + targetColVector.isNull[0] = true; + return; + } + + if (sourceColVector instanceof LongColumnVector) { + ((LongColumnVector) targetColVector).vector[0] = ((LongColumnVector) sourceColVector).vector[0]; + } else if (sourceColVector instanceof DoubleColumnVector) { + ((DoubleColumnVector) targetColVector).vector[0] = ((DoubleColumnVector) sourceColVector).vector[0]; + } else if (sourceColVector instanceof BytesColumnVector) { + BytesColumnVector bytesColVector = (BytesColumnVector) sourceColVector; + byte[] bytes = bytesColVector.vector[0]; + final int start = bytesColVector.start[0]; + final int length = bytesColVector.length[0]; + if (setByValue) { + ((BytesColumnVector) targetColVector).setVal(0, bytes, start, length); + } else { + ((BytesColumnVector) targetColVector).setRef(0, bytes, start, length); + } + } else if (sourceColVector instanceof DecimalColumnVector) { + ((DecimalColumnVector) targetColVector).set(0, ((DecimalColumnVector) sourceColVector).vector[0]); + } else if (sourceColVector instanceof TimestampColumnVector) { + ((TimestampColumnVector) targetColVector).set(0, ((TimestampColumnVector) sourceColVector).asScratchTimestamp(0)); + } else if (sourceColVector instanceof IntervalDayTimeColumnVector) { + ((IntervalDayTimeColumnVector) targetColVector).set(0, ((IntervalDayTimeColumnVector) sourceColVector).asScratchIntervalDayTime(0)); + } else { + throw new RuntimeException("Column vector class " + sourceColVector.getClass().getName() + + " is not supported!"); + } + } + /** * Make a new (scratch) batch, which is exactly "like" the batch provided, except that it's empty * @param batch the batch to imitate @@ -635,7 +682,7 @@ public static VectorizedRowBatch makeLike(VectorizedRowBatch batch) throws HiveE VectorizedRowBatch newBatch = new VectorizedRowBatch(batch.numCols); for (int i = 0; i < batch.numCols; i++) { if (batch.cols[i] != null) { - newBatch.cols[i] = cloneColumnVector(batch.cols[i]); + newBatch.cols[i] = makeLikeColumnVector(batch.cols[i]); newBatch.cols[i].init(); } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorBase.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorBase.java new file mode 100644 index 0000000..a1ffd47 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorBase.java @@ -0,0 +1,98 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.ql.exec.vector.ptf; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector.Type; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.exec.vector.expressions.IdentityExpression; +import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; +import org.apache.hadoop.hive.ql.plan.ptf.WindowFrameDef; +import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; + +/** + * This is the vector PTF evaluator base class. + */ +public abstract class VectorPTFEvaluatorBase { + + private static final long serialVersionUID = 1L; + private static final String CLASS_NAME = VectorPTFEvaluatorBase.class.getName(); + private static final Log LOG = LogFactory.getLog(CLASS_NAME); + + private final WindowFrameDef windowFrameDef; + private final VectorExpression inputVecExpr; + protected final int inputColumnNum; + protected final int outputColumnNum; + + public VectorPTFEvaluatorBase(WindowFrameDef windowFrameDef, VectorExpression inputVecExpr, + int outputColumnNum) { + this.windowFrameDef = windowFrameDef; + if (inputVecExpr == null) { + inputColumnNum = -1; + this.inputVecExpr = null; + } else { + inputColumnNum = inputVecExpr.getOutputColumn(); + if (inputVecExpr instanceof IdentityExpression) { + this.inputVecExpr = null; + } else { + this.inputVecExpr = inputVecExpr; + } + } + this.outputColumnNum = outputColumnNum; + } + + public void evaluateGroupBatch(VectorizedRowBatch batch) { + if (inputVecExpr != null) { + inputVecExpr.evaluate(batch); + } + } + + public abstract void evaluateEndOfGroup(int groupCount); + + public boolean streamsResult() { + // Assume by default no. + return false; + } + + public boolean isGroupValueNull() { + return false; + } + + public abstract Type getColumnVectorType(); + + public long getLongGroupValue() { + throw new RuntimeException("No long group value evaluator implementation " + this.getClass().getName()); + } + + public double getDoubleGroupValue() { + throw new RuntimeException("No double group value evaluator implementation " + this.getClass().getName()); + } + + public HiveDecimalWritable getDecimalGroupValue() { + throw new RuntimeException("No decimal group value evaluator implementation " + this.getClass().getName()); + } + + public int getOutputColumnNum() { + return outputColumnNum; + } + + public abstract void resetEvaluator(); +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorCount.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorCount.java new file mode 100644 index 0000000..5b6de8f --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorCount.java @@ -0,0 +1,111 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.ptf; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector.Type; +import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; +import org.apache.hadoop.hive.ql.plan.ptf.WindowFrameDef; + +import com.google.common.base.Preconditions; + +/** + * This class evaluates count(column) for a PTF group. + */ +public class VectorPTFEvaluatorCount extends VectorPTFEvaluatorBase { + + private static final long serialVersionUID = 1L; + private static final String CLASS_NAME = VectorPTFEvaluatorCount.class.getName(); + private static final Log LOG = LogFactory.getLog(CLASS_NAME); + + protected long count; + + public VectorPTFEvaluatorCount(WindowFrameDef windowFrameDef, VectorExpression inputVecExpr, + int outputColumnNum) { + super(windowFrameDef, inputVecExpr, outputColumnNum); + resetEvaluator(); + } + + public void evaluateGroupBatch(VectorizedRowBatch batch) { + super.evaluateGroupBatch(batch); + + // Count non-null column rows; maintain isNull. + + // We do not filter when PTF is in reducer. + Preconditions.checkState(!batch.selectedInUse); + + final int size = batch.size; + if (size == 0) { + return; + } + ColumnVector colVector = batch.cols[inputColumnNum]; + if (colVector.isRepeating) { + if (colVector.noNulls) { + count += size; + } + } else if (colVector.noNulls) { + count += size; + } else { + boolean[] batchIsNull = colVector.isNull; + int i = 0; + while (batchIsNull[i]) { + if (++i >= size) { + return; + } + } + long varCount = 1; + i++; + for (; i < size; i++) { + if (!batchIsNull[i]) { + varCount++; + } + } + count += varCount; + } + } + + @Override + public void evaluateEndOfGroup(int groupCount) { + // Nothing to do. + } + + @Override + public boolean isGroupValueNull() { + return false; + } + + @Override + public Type getColumnVectorType() { + return Type.LONG; + } + + @Override + public long getLongGroupValue() { + return count; + } + + @Override + public void resetEvaluator() { + count = 0; + } +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorCountStar.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorCountStar.java new file mode 100644 index 0000000..b8fc080 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorCountStar.java @@ -0,0 +1,79 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.ptf; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector.Type; +import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; +import org.apache.hadoop.hive.ql.plan.ptf.WindowFrameDef; + +import com.google.common.base.Preconditions; + +/** + * This class evaluates count(*) for a PTF group. + */ +public class VectorPTFEvaluatorCountStar extends VectorPTFEvaluatorBase { + + private static final long serialVersionUID = 1L; + private static final String CLASS_NAME = VectorPTFEvaluatorCountStar.class.getName(); + private static final Log LOG = LogFactory.getLog(CLASS_NAME); + + protected long count; + + public VectorPTFEvaluatorCountStar(WindowFrameDef windowFrameDef, VectorExpression inputVecExpr, + int outputColumnNum) { + super(windowFrameDef, inputVecExpr, outputColumnNum); + resetEvaluator(); + } + + public void evaluateGroupBatch(VectorizedRowBatch batch) { + super.evaluateGroupBatch(batch); + + // Nothing to do. + } + + @Override + public void evaluateEndOfGroup(int groupCount) { + count += groupCount; + } + + @Override + public boolean isGroupValueNull() { + return false; + } + + @Override + public Type getColumnVectorType() { + return Type.LONG; + } + + @Override + public long getLongGroupValue() { + return count; + } + + @Override + public void resetEvaluator() { + count = 0; + } +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorDecimalAvg.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorDecimalAvg.java new file mode 100644 index 0000000..b1ce512 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorDecimalAvg.java @@ -0,0 +1,151 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.ql.exec.vector.ptf; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.common.type.HiveDecimal; +import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector.Type; +import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; +import org.apache.hadoop.hive.ql.plan.ptf.WindowFrameDef; +import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; + +import com.google.common.base.Preconditions; + +/** + * This class evaluates HiveDecimal avg() for a PTF group. + */ +public class VectorPTFEvaluatorDecimalAvg extends VectorPTFEvaluatorBase { + + private static final long serialVersionUID = 1L; + private static final String CLASS_NAME = VectorPTFEvaluatorDecimalAvg.class.getName(); + private static final Log LOG = LogFactory.getLog(CLASS_NAME); + + protected boolean isNull; + protected HiveDecimalWritable sum; + private int nonNullGroupCount; + private HiveDecimalWritable temp; + private HiveDecimalWritable avg; + + public VectorPTFEvaluatorDecimalAvg(WindowFrameDef windowFrameDef, VectorExpression inputVecExpr, + int outputColumnNum) { + super(windowFrameDef, inputVecExpr, outputColumnNum); + sum = new HiveDecimalWritable(); + temp = new HiveDecimalWritable(); + avg = new HiveDecimalWritable(); + resetEvaluator(); + } + + public void evaluateGroupBatch(VectorizedRowBatch batch) { + super.evaluateGroupBatch(batch); + + // Sum decimal column for avg; maintain isNull. + + // We do not filter when PTF is in reducer. + Preconditions.checkState(!batch.selectedInUse); + + final int size = batch.size; + if (size == 0) { + return; + } + DecimalColumnVector decimalColVector = ((DecimalColumnVector) batch.cols[inputColumnNum]); + if (decimalColVector.isRepeating) { + if (decimalColVector.noNulls) { + temp.setFromLong(batch.size); + if (isNull) { + sum.set(decimalColVector.vector[0]); + sum.mutateMultiply(temp); + isNull = false; + } else { + temp.mutateMultiply(decimalColVector.vector[0]); + sum.mutateAdd(temp); + } + nonNullGroupCount += size; + } + } else if (decimalColVector.noNulls) { + HiveDecimalWritable[] vector = decimalColVector.vector; + if (isNull) { + sum.set(vector[0]); + isNull = false; + } else { + sum.mutateAdd(vector[0]); + } + for (int i = 1; i < size; i++) { + sum.mutateAdd(vector[i]); + } + nonNullGroupCount += size; + } else { + boolean[] batchIsNull = decimalColVector.isNull; + int i = 0; + while (batchIsNull[i]) { + if (++i >= size) { + return; + } + } + HiveDecimalWritable[] vector = decimalColVector.vector; + if (isNull) { + sum.set(vector[i++]); + isNull = false; + } else { + sum.mutateAdd(vector[i++]); + } + nonNullGroupCount++; + for (; i < size; i++) { + if (!batchIsNull[i]) { + sum.mutateAdd(vector[i]); + nonNullGroupCount++; + } + } + } + } + + @Override + public void evaluateEndOfGroup(int groupCount) { + if (!isNull) { + avg.set(sum); + temp.setFromLong(nonNullGroupCount); + avg.mutateDivide(temp); + } + } + + @Override + public boolean isGroupValueNull() { + return isNull; + } + + @Override + public Type getColumnVectorType() { + return Type.DECIMAL; + } + + @Override + public HiveDecimalWritable getDecimalGroupValue() { + return avg; + } + + @Override + public void resetEvaluator() { + isNull = true; + sum.set(HiveDecimal.ZERO); + nonNullGroupCount = 0; + avg.set(HiveDecimal.ZERO); + } +} \ No newline at end of file diff
--git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorDecimalFirstValue.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorDecimalFirstValue.java new file mode 100644 index 0000000..ef3d725 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorDecimalFirstValue.java @@ -0,0 +1,123 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.ptf; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.common.type.FastHiveDecimal; +import org.apache.hadoop.hive.common.type.HiveDecimal; +import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector.Type; +import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; +import org.apache.hadoop.hive.ql.plan.ptf.WindowFrameDef; +import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; + +import com.google.common.base.Preconditions; + +/** + * This class evaluates HiveDecimal first_value() for a PTF group. + */ +public class VectorPTFEvaluatorDecimalFirstValue extends VectorPTFEvaluatorBase { + + private static final long serialVersionUID = 1L; + private static final String CLASS_NAME = VectorPTFEvaluatorDecimalFirstValue.class.getName(); + private static final Log LOG = LogFactory.getLog(CLASS_NAME); + + protected boolean haveFirstValue; + protected boolean isNull; + protected HiveDecimalWritable firstValue; + + public VectorPTFEvaluatorDecimalFirstValue(WindowFrameDef windowFrameDef, + VectorExpression inputVecExpr, int outputColumnNum) { + super(windowFrameDef, inputVecExpr, outputColumnNum); + firstValue = new HiveDecimalWritable(); + resetEvaluator(); + } + + public void evaluateGroupBatch(VectorizedRowBatch batch) { + super.evaluateGroupBatch(batch); + + // Capture first value; maintain isNull. + + // We do not filter when PTF is in reducer. 
+ Preconditions.checkState(!batch.selectedInUse); + + if (!haveFirstValue) { + final int size = batch.size; + if (size == 0) { + return; + } + DecimalColumnVector decimalColVector = ((DecimalColumnVector) batch.cols[inputColumnNum]); + if (decimalColVector.isRepeating) { + if (decimalColVector.noNulls) { + firstValue.set(decimalColVector.vector[0]); + isNull = false; + } else { + isNull = true; + } + } else if (decimalColVector.noNulls) { + firstValue.set(decimalColVector.vector[0]); + isNull = false; + } else { + if (decimalColVector.isNull[0]) { + isNull = true; + } else { + firstValue.set(decimalColVector.vector[0]); + isNull = false; + } + } + haveFirstValue = true; + } + + // First value is repeated for all batches. + DecimalColumnVector outputColVector = (DecimalColumnVector) batch.cols[outputColumnNum]; + outputColVector.isRepeating = true; + if (isNull) { + outputColVector.noNulls = false; + outputColVector.isNull[0] = true; + } else { + outputColVector.noNulls = true; + outputColVector.isNull[0] = false; + outputColVector.vector[0].set(firstValue); + } + } + + @Override + public void evaluateEndOfGroup(int groupCount) { + // Nothing to do. + } + + public boolean streamsResult() { + return true; + } + + @Override + public Type getColumnVectorType() { + return Type.DECIMAL; + } + + @Override + public void resetEvaluator() { + haveFirstValue = false; + isNull = true; + firstValue.set(HiveDecimal.ZERO); + } +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorDecimalLastValue.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorDecimalLastValue.java new file mode 100644 index 0000000..2d46c83 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorDecimalLastValue.java @@ -0,0 +1,116 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.ptf; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.common.type.FastHiveDecimal; +import org.apache.hadoop.hive.common.type.HiveDecimal; +import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector.Type; +import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; +import org.apache.hadoop.hive.ql.plan.ptf.WindowFrameDef; +import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; + +import com.google.common.base.Preconditions; + +/** + * This class evaluates HiveDecimal last_value() for a PTF group. 
+ */ +public class VectorPTFEvaluatorDecimalLastValue extends VectorPTFEvaluatorBase { + + private static final long serialVersionUID = 1L; + private static final String CLASS_NAME = VectorPTFEvaluatorDecimalLastValue.class.getName(); + private static final Log LOG = LogFactory.getLog(CLASS_NAME); + + protected boolean isNull; + protected HiveDecimalWritable lastValue; + + public VectorPTFEvaluatorDecimalLastValue(WindowFrameDef windowFrameDef, + VectorExpression inputVecExpr, int outputColumnNum) { + super(windowFrameDef, inputVecExpr, outputColumnNum); + lastValue = new HiveDecimalWritable(); + resetEvaluator(); + } + + public void evaluateGroupBatch(VectorizedRowBatch batch) { + super.evaluateGroupBatch(batch); + + // Capture last value; maintain isNull. + + // We do not filter when PTF is in reducer. + Preconditions.checkState(!batch.selectedInUse); + + if (!isNull) { + return; + } + final int size = batch.size; + if (size == 0) { + return; + } + // Remember last value of each batch. + DecimalColumnVector decimalColVector = ((DecimalColumnVector) batch.cols[inputColumnNum]); + if (decimalColVector.isRepeating) { + if (decimalColVector.noNulls) { + lastValue.set(decimalColVector.vector[0]); + isNull = false; + } + } else if (decimalColVector.noNulls) { + lastValue.set(decimalColVector.vector[size - 1]); + isNull = false; + } else { + boolean[] batchIsNull = decimalColVector.isNull; + int i = size - 1; + while (batchIsNull[i]) { + if (--i < 0) { + return; + } + } + lastValue.set(decimalColVector.vector[i]); + isNull = false; + } + } + + @Override + public void evaluateEndOfGroup(int groupCount) { + // Nothing to do. + } + + @Override + public boolean isGroupValueNull() { + return isNull; + } + + @Override + public Type getColumnVectorType() { + return Type.DECIMAL; + } + + @Override + public HiveDecimalWritable getDecimalGroupValue() { + return lastValue; + } + + @Override + public void resetEvaluator() { + isNull = true; + lastValue.set(HiveDecimal.ZERO); + } +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorDecimalMax.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorDecimalMax.java new file mode 100644 index 0000000..a2906af --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorDecimalMax.java @@ -0,0 +1,151 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.ql.exec.vector.ptf; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.common.type.FastHiveDecimal; +import org.apache.hadoop.hive.common.type.HiveDecimal; +import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector.Type; +import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; +import org.apache.hadoop.hive.ql.plan.ptf.WindowFrameDef; +import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; + +import com.google.common.base.Preconditions; + +/** + * This class evaluates HiveDecimal max() for a PTF group. + */ +public class VectorPTFEvaluatorDecimalMax extends VectorPTFEvaluatorBase { + + private static final long serialVersionUID = 1L; + private static final String CLASS_NAME = VectorPTFEvaluatorDecimalMax.class.getName(); + private static final Log LOG = LogFactory.getLog(CLASS_NAME); + + protected boolean isNull; + protected HiveDecimalWritable max; + + public VectorPTFEvaluatorDecimalMax(WindowFrameDef windowFrameDef, VectorExpression inputVecExpr, + int outputColumnNum) { + super(windowFrameDef, inputVecExpr, outputColumnNum); + max = new HiveDecimalWritable(); + resetEvaluator(); + } + + public void evaluateGroupBatch(VectorizedRowBatch batch) { + super.evaluateGroupBatch(batch); + + // Max decimal column; maintain isNull. + + // We do not filter when PTF is in reducer. + Preconditions.checkState(!batch.selectedInUse); + + final int size = batch.size; + if (size == 0) { + return; + } + DecimalColumnVector decimalColVector = ((DecimalColumnVector) batch.cols[inputColumnNum]); + if (decimalColVector.isRepeating) { + if (decimalColVector.noNulls) { + if (isNull) { + max.set(decimalColVector.vector[0]); + isNull = false; + } else { + HiveDecimalWritable repeatedMax = decimalColVector.vector[0]; + if (repeatedMax.compareTo(max) == 1) { + max.set(repeatedMax); + } + } + } + } else if (decimalColVector.noNulls) { + HiveDecimalWritable[] vector = decimalColVector.vector; + if (isNull) { + max.set(vector[0]); + isNull = false; + } else { + final HiveDecimalWritable dec = vector[0]; + if (dec.compareTo(max) == 1) { + max.set(dec); + } + } + for (int i = 1; i < size; i++) { + final HiveDecimalWritable dec = vector[i]; + if (dec.compareTo(max) == 1) { + max.set(dec); + } + } + } else { + boolean[] batchIsNull = decimalColVector.isNull; + int i = 0; + while (batchIsNull[i]) { + if (++i >= size) { + return; + } + } + HiveDecimalWritable[] vector = decimalColVector.vector; + if (isNull) { + max.set(vector[i++]); + isNull = false; + } else { + final HiveDecimalWritable dec = vector[i++]; + if (dec.compareTo(max) == 1) { + max.set(dec); + } + } + for (; i < size; i++) { + if (!batchIsNull[i]) { + final HiveDecimalWritable dec = vector[i]; + if (dec.compareTo(max) == 1) { + max.set(dec); + } + } + } + } + } + + @Override + public void evaluateEndOfGroup(int groupCount) { + // Nothing to do.
+ } + + @Override + public boolean isGroupValueNull() { + return isNull; + } + + @Override + public Type getColumnVectorType() { + return Type.DECIMAL; + } + + @Override + public HiveDecimalWritable getDecimalGroupValue() { + return max; + } + + private static HiveDecimal MIN_VALUE = HiveDecimal.create("-99999999999999999999999999999999999999"); + + @Override + public void resetEvaluator() { + isNull = true; + max.set(MIN_VALUE); + } +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorDecimalMin.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorDecimalMin.java new file mode 100644 index 0000000..5197aae --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorDecimalMin.java @@ -0,0 +1,151 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.ptf; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.common.type.FastHiveDecimal; +import org.apache.hadoop.hive.common.type.HiveDecimal; +import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector.Type; +import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; +import org.apache.hadoop.hive.ql.plan.ptf.WindowFrameDef; +import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; + +import com.google.common.base.Preconditions; + +/** + * This class evaluates HiveDecimal min() for a PTF group. + */ +public class VectorPTFEvaluatorDecimalMin extends VectorPTFEvaluatorBase { + + private static final long serialVersionUID = 1L; + private static final String CLASS_NAME = VectorPTFEvaluatorDecimalMin.class.getName(); + private static final Log LOG = LogFactory.getLog(CLASS_NAME); + + protected boolean isNull; + protected HiveDecimalWritable min; + + public VectorPTFEvaluatorDecimalMin(WindowFrameDef windowFrameDef, VectorExpression inputVecExpr, + int outputColumnNum) { + super(windowFrameDef, inputVecExpr, outputColumnNum); + min = new HiveDecimalWritable(); + resetEvaluator(); + } + + public void evaluateGroupBatch(VectorizedRowBatch batch) { + super.evaluateGroupBatch(batch); + + // Min decimal column; maintain isNull. + + // We do not filter when PTF is in reducer. 
+ Preconditions.checkState(!batch.selectedInUse); + + final int size = batch.size; + if (size == 0) { + return; + } + DecimalColumnVector decimalColVector = ((DecimalColumnVector) batch.cols[inputColumnNum]); + if (decimalColVector.isRepeating) { + if (decimalColVector.noNulls) { + if (isNull) { + min.set(decimalColVector.vector[0]); + isNull = false; + } else { + HiveDecimalWritable repeatedMin = decimalColVector.vector[0]; + if (repeatedMin.compareTo(min) == -1) { + min.set(repeatedMin); + } + } + } + } else if (decimalColVector.noNulls) { + HiveDecimalWritable[] vector = decimalColVector.vector; + if (isNull) { + min.set(vector[0]); + isNull = false; + } else { + final HiveDecimalWritable dec = vector[0]; + if (dec.compareTo(min) == -1) { + min.set(dec); + } + } + for (int i = 1; i < size; i++) { + final HiveDecimalWritable dec = vector[i]; + if (dec.compareTo(min) == -1) { + min.set(dec); + } + } + } else { + boolean[] batchIsNull = decimalColVector.isNull; + int i = 0; + while (batchIsNull[i]) { + if (++i >= size) { + return; + } + } + HiveDecimalWritable[] vector = decimalColVector.vector; + if (isNull) { + min.set(vector[i++]); + isNull = false; + } else { + final HiveDecimalWritable dec = vector[i++]; + if (dec.compareTo(min) == -1) { + min.set(dec); + } + } + for (; i < size; i++) { + if (!batchIsNull[i]) { + final HiveDecimalWritable dec = vector[i]; + if (dec.compareTo(min) == -1) { + min.set(dec); + } + } + } + } + } + + @Override + public void evaluateEndOfGroup(int groupCount) { + // Nothing to do. + } + + @Override + public boolean isGroupValueNull() { + return isNull; + } + + @Override + public Type getColumnVectorType() { + return Type.DECIMAL; + } + + @Override + public HiveDecimalWritable getDecimalGroupValue() { + return min; + } + + private static HiveDecimal MAX_VALUE = HiveDecimal.create("99999999999999999999999999999999999999"); + + @Override + public void resetEvaluator() { + isNull = true; + min.set(MAX_VALUE); + } +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorDecimalSum.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorDecimalSum.java new file mode 100644 index 0000000..91f74ae --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorDecimalSum.java @@ -0,0 +1,138 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.ql.exec.vector.ptf; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.common.type.HiveDecimal; +import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector.Type; +import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; +import org.apache.hadoop.hive.ql.plan.ptf.WindowFrameDef; +import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; + +import com.google.common.base.Preconditions; + +/** + * This class evaluates HiveDecimal sum() for a PTF group. + */ +public class VectorPTFEvaluatorDecimalSum extends VectorPTFEvaluatorBase { + + private static final long serialVersionUID = 1L; + private static final String CLASS_NAME = VectorPTFEvaluatorDecimalSum.class.getName(); + private static final Log LOG = LogFactory.getLog(CLASS_NAME); + + protected boolean isNull; + protected HiveDecimalWritable sum; + protected HiveDecimalWritable temp; + + public VectorPTFEvaluatorDecimalSum(WindowFrameDef windowFrameDef, VectorExpression inputVecExpr, + int outputColumnNum) { + super(windowFrameDef, inputVecExpr, outputColumnNum); + sum = new HiveDecimalWritable(); + temp = new HiveDecimalWritable(); + resetEvaluator(); + } + + public void evaluateGroupBatch(VectorizedRowBatch batch) { + super.evaluateGroupBatch(batch); + + // Sum decimal column; maintain isNull. + + // We do not filter when PTF is in reducer. + Preconditions.checkState(!batch.selectedInUse); + + final int size = batch.size; + if (size == 0) { + return; + } + DecimalColumnVector decimalColVector = ((DecimalColumnVector) batch.cols[inputColumnNum]); + if (decimalColVector.isRepeating) { + if (decimalColVector.noNulls) { + temp.setFromLong(batch.size); + if (isNull) { + sum.set(decimalColVector.vector[0]); + sum.mutateMultiply(temp); + isNull = false; + } else { + temp.mutateMultiply(decimalColVector.vector[0]); + sum.mutateAdd(temp); + } + } + } else if (decimalColVector.noNulls) { + HiveDecimalWritable[] vector = decimalColVector.vector; + if (isNull) { + sum.set(vector[0]); + isNull = false; + } else { + sum.mutateAdd(vector[0]); + } + for (int i = 1; i < size; i++) { + sum.mutateAdd(vector[i]); + } + } else { + boolean[] batchIsNull = decimalColVector.isNull; + int i = 0; + while (batchIsNull[i]) { + if (++i >= size) { + return; + } + } + HiveDecimalWritable[] vector = decimalColVector.vector; + if (isNull) { + sum.set(vector[i++]); + isNull = false; + } else { + sum.mutateAdd(vector[i++]); + } + for (; i < size; i++) { + if (!batchIsNull[i]) { + sum.mutateAdd(vector[i]); + } + } + } + } + + @Override + public void evaluateEndOfGroup(int groupCount) { + // Nothing to do. 
+ }
+
+ @Override
+ public boolean isGroupValueNull() {
+ return isNull;
+ }
+
+ @Override
+ public Type getColumnVectorType() {
+ return Type.DECIMAL;
+ }
+
+ @Override
+ public HiveDecimalWritable getDecimalGroupValue() {
+ return sum;
+ }
+
+ @Override
+ public void resetEvaluator() {
+ isNull = true;
+ sum.set(HiveDecimal.ZERO);
+ }
+}
\ No newline at end of file
diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorDenseRank.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorDenseRank.java
new file mode 100644
index 0000000..e417418
--- /dev/null
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorDenseRank.java
@@ -0,0 +1,77 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.vector.ptf;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.hive.ql.exec.vector.ColumnVector.Type;
+import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression;
+import org.apache.hadoop.hive.ql.plan.ptf.WindowFrameDef;
+
+/**
+ * This class evaluates dense_rank() for a PTF group.
+ */
+public class VectorPTFEvaluatorDenseRank extends VectorPTFEvaluatorBase {
+
+ private static final long serialVersionUID = 1L;
+ private static final String CLASS_NAME = VectorPTFEvaluatorDenseRank.class.getName();
+ private static final Log LOG = LogFactory.getLog(CLASS_NAME);
+
+ private int denseRank;
+ private int groupDenseRank;
+
+ public VectorPTFEvaluatorDenseRank(WindowFrameDef windowFrameDef, VectorExpression inputVecExpr,
+ int outputColumnNum) {
+ super(windowFrameDef, inputVecExpr, outputColumnNum);
+ resetEvaluator();
+ }
+
+ public void evaluateGroupBatch(VectorizedRowBatch batch) {
+ super.evaluateGroupBatch(batch);
+
+ // Nothing to do.
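+ // All the work happens in evaluateEndOfGroup: dense_rank advances by exactly 1 per
+ // peer group regardless of the group's row count. For example, order values
+ // 10, 10, 20, 30, 30 produce dense ranks 1, 1, 2, 3, 3, while rank() would produce
+ // 1, 1, 3, 4, 4.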
+ } + + @Override + public void evaluateEndOfGroup(int groupCount) { + groupDenseRank = denseRank; + denseRank++; + } + + public boolean isGroupValueNull() { + return false; + } + + @Override + public Type getColumnVectorType() { + return Type.LONG; + } + + @Override + public long getLongGroupValue() { + return groupDenseRank; + } + + @Override + public void resetEvaluator() { + denseRank = 1; + groupDenseRank = 0; + } +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorDoubleAvg.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorDoubleAvg.java new file mode 100644 index 0000000..d07c492 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorDoubleAvg.java @@ -0,0 +1,142 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.ptf; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector.Type; +import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; +import org.apache.hadoop.hive.ql.plan.ptf.WindowFrameDef; + +import com.google.common.base.Preconditions; + +/** + * This class evaluates double avg() for a PTF group. + */ +public class VectorPTFEvaluatorDoubleAvg extends VectorPTFEvaluatorBase { + + private static final long serialVersionUID = 1L; + private static final String CLASS_NAME = VectorPTFEvaluatorDoubleAvg.class.getName(); + private static final Log LOG = LogFactory.getLog(CLASS_NAME); + + protected boolean isNull; + protected double sum; + private int nonNullGroupCount; + private double avg; + + public VectorPTFEvaluatorDoubleAvg(WindowFrameDef windowFrameDef, VectorExpression inputVecExpr, + int outputColumnNum) { + super(windowFrameDef, inputVecExpr, outputColumnNum); + resetEvaluator(); + } + + public void evaluateGroupBatch(VectorizedRowBatch batch) { + super.evaluateGroupBatch(batch); + + // Sum double column for avg; maintain isNull. + + // We do not filter when PTF is in reducer. 
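+ // selectedInUse would mean rows were filtered upstream; the accumulation below walks
+ // the vector densely (a repeating batch contributes vector[0] * batch.size in a single
+ // multiply), so assert that invariant first.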
+ Preconditions.checkState(!batch.selectedInUse); + + final int size = batch.size; + if (size == 0) { + return; + } + DoubleColumnVector doubleColVector = ((DoubleColumnVector) batch.cols[inputColumnNum]); + if (doubleColVector.isRepeating) { + if (doubleColVector.noNulls) { + if (isNull) { + sum = doubleColVector.vector[0] * batch.size; + isNull = false; + } else { + sum += doubleColVector.vector[0] * batch.size; + } + nonNullGroupCount += size; + } + } else if (doubleColVector.noNulls) { + double[] vector = doubleColVector.vector; + double varSum = vector[0]; + for (int i = 1; i < size; i++) { + varSum += vector[i]; + } + nonNullGroupCount += size; + if (isNull) { + sum = varSum; + isNull = false; + } else { + sum += varSum; + } + } else { + boolean[] batchIsNull = doubleColVector.isNull; + int i = 0; + while (batchIsNull[i]) { + if (++i >= size) { + return; + } + } + double[] vector = doubleColVector.vector; + double varSum = vector[i++]; + nonNullGroupCount++; + for (; i < size; i++) { + if (!batchIsNull[i]) { + varSum += vector[i]; + nonNullGroupCount++; + } + } + if (isNull) { + sum = varSum; + isNull = false; + } else { + sum += varSum; + } + } + } + + @Override + public void evaluateEndOfGroup(int groupCount) { + if (!isNull) { + avg = sum / nonNullGroupCount; + } + } + + @Override + public boolean isGroupValueNull() { + return isNull; + } + + @Override + public Type getColumnVectorType() { + return Type.DOUBLE; + } + + @Override + public double getDoubleGroupValue() { + return avg; + } + + @Override + public void resetEvaluator() { + isNull = true; + sum = 0.0; + nonNullGroupCount = 0; + avg = 0.0; + } +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorDoubleFirstValue.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorDoubleFirstValue.java new file mode 100644 index 0000000..d269e40 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorDoubleFirstValue.java @@ -0,0 +1,119 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.ptf; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector.Type; +import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; +import org.apache.hadoop.hive.ql.plan.ptf.WindowFrameDef; + +import com.google.common.base.Preconditions; + +/** + * This class evaluates double first_value() for a PTF group. 
+ */
+public class VectorPTFEvaluatorDoubleFirstValue extends VectorPTFEvaluatorBase {
+
+ private static final long serialVersionUID = 1L;
+ private static final String CLASS_NAME = VectorPTFEvaluatorDoubleFirstValue.class.getName();
+ private static final Log LOG = LogFactory.getLog(CLASS_NAME);
+
+ protected boolean haveFirstValue;
+ protected boolean isNull;
+ protected double firstValue;
+
+ public VectorPTFEvaluatorDoubleFirstValue(WindowFrameDef windowFrameDef,
+ VectorExpression inputVecExpr, int outputColumnNum) {
+ super(windowFrameDef, inputVecExpr, outputColumnNum);
+ resetEvaluator();
+ }
+
+ public void evaluateGroupBatch(VectorizedRowBatch batch) {
+ super.evaluateGroupBatch(batch);
+
+ // Capture first double; maintain isNull; stream fill result as repeated.
+
+ // We do not filter when PTF is in reducer.
+ Preconditions.checkState(!batch.selectedInUse);
+
+ if (!haveFirstValue) {
+ final int size = batch.size;
+ if (size == 0) {
+ return;
+ }
+ DoubleColumnVector doubleColVector = ((DoubleColumnVector) batch.cols[inputColumnNum]);
+ if (doubleColVector.isRepeating) {
+ if (doubleColVector.noNulls) {
+ firstValue = doubleColVector.vector[0];
+ isNull = false;
+ } else {
+ isNull = true;
+ }
+ } else if (doubleColVector.noNulls) {
+ firstValue = doubleColVector.vector[0];
+ isNull = false;
+ } else {
+ if (doubleColVector.isNull[0]) {
+ isNull = true;
+ } else {
+ firstValue = doubleColVector.vector[0];
+ isNull = false;
+ }
+ }
+ haveFirstValue = true;
+ }
+
+ // First value is repeated for all batches.
+ DoubleColumnVector outputColVector = (DoubleColumnVector) batch.cols[outputColumnNum];
+ outputColVector.isRepeating = true;
+ if (isNull) {
+ outputColVector.noNulls = false;
+ outputColVector.isNull[0] = true;
+ } else {
+ outputColVector.noNulls = true;
+ outputColVector.isNull[0] = false;
+ outputColVector.vector[0] = firstValue;
+ }
+ }
+
+ @Override
+ public void evaluateEndOfGroup(int groupCount) {
+ // Nothing to do.
+ }
+
+ public boolean streamsResult() {
+ return true;
+ }
+
+ @Override
+ public Type getColumnVectorType() {
+ return Type.DOUBLE;
+ }
+
+ @Override
+ public void resetEvaluator() {
+ haveFirstValue = false;
+ isNull = true;
+ firstValue = 0.0;
+ }
+}
\ No newline at end of file
diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorDoubleLastValue.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorDoubleLastValue.java
new file mode 100644
index 0000000..0daff19
--- /dev/null
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorDoubleLastValue.java
@@ -0,0 +1,109 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.vector.ptf;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.hive.ql.exec.vector.ColumnVector.Type;
+import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression;
+import org.apache.hadoop.hive.ql.plan.ptf.WindowFrameDef;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * This class evaluates double last_value() for a PTF group.
+ */
+public class VectorPTFEvaluatorDoubleLastValue extends VectorPTFEvaluatorBase {
+
+ private static final long serialVersionUID = 1L;
+ private static final String CLASS_NAME = VectorPTFEvaluatorDoubleLastValue.class.getName();
+ private static final Log LOG = LogFactory.getLog(CLASS_NAME);
+
+ protected boolean isNull;
+ protected double lastValue;
+
+ public VectorPTFEvaluatorDoubleLastValue(WindowFrameDef windowFrameDef,
+ VectorExpression inputVecExpr, int outputColumnNum) {
+ super(windowFrameDef, inputVecExpr, outputColumnNum);
+ resetEvaluator();
+ }
+
+ public void evaluateGroupBatch(VectorizedRowBatch batch) {
+ super.evaluateGroupBatch(batch);
+
+ // Capture last value; maintain isNull.
+
+ // We do not filter when PTF is in reducer.
+ Preconditions.checkState(!batch.selectedInUse);
+
+ final int size = batch.size;
+ if (size == 0) {
+ return;
+ }
+ // Remember last value of each batch.
+ DoubleColumnVector doubleColVector = ((DoubleColumnVector) batch.cols[inputColumnNum]);
+ if (doubleColVector.isRepeating) {
+ if (doubleColVector.noNulls) {
+ lastValue = doubleColVector.vector[0];
+ isNull = false;
+ }
+ } else if (doubleColVector.noNulls) {
+ lastValue = doubleColVector.vector[size - 1];
+ isNull = false;
+ } else {
+ boolean[] batchIsNull = doubleColVector.isNull;
+ int i = size - 1;
+ while (batchIsNull[i]) {
+ if (--i < 0) {
+ return;
+ }
+ }
+ lastValue = doubleColVector.vector[i];
+ isNull = false;
+ }
+ }
+
+ @Override
+ public void evaluateEndOfGroup(int groupCount) {
+ // Nothing to do.
+ }
+
+ @Override
+ public boolean isGroupValueNull() {
+ return isNull;
+ }
+
+ @Override
+ public Type getColumnVectorType() {
+ return Type.DOUBLE;
+ }
+
+ @Override
+ public double getDoubleGroupValue() {
+ return lastValue;
+ }
+
+ @Override
+ public void resetEvaluator() {
+ isNull = true;
+ lastValue = 0.0;
+ }
+}
\ No newline at end of file
diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorDoubleMax.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorDoubleMax.java
new file mode 100644
index 0000000..044b740
--- /dev/null
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorDoubleMax.java
@@ -0,0 +1,141 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.vector.ptf;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.hive.ql.exec.vector.ColumnVector.Type;
+import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression;
+import org.apache.hadoop.hive.ql.plan.ptf.WindowFrameDef;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * This class evaluates double max() for a PTF group.
+ */
+public class VectorPTFEvaluatorDoubleMax extends VectorPTFEvaluatorBase {
+
+ private static final long serialVersionUID = 1L;
+ private static final String CLASS_NAME = VectorPTFEvaluatorDoubleMax.class.getName();
+ private static final Log LOG = LogFactory.getLog(CLASS_NAME);
+
+ protected boolean isNull;
+ protected double max;
+
+ public VectorPTFEvaluatorDoubleMax(WindowFrameDef windowFrameDef, VectorExpression inputVecExpr,
+ int outputColumnNum) {
+ super(windowFrameDef, inputVecExpr, outputColumnNum);
+ resetEvaluator();
+ }
+
+ public void evaluateGroupBatch(VectorizedRowBatch batch) {
+ super.evaluateGroupBatch(batch);
+
+ // Max double column; maintain isNull.
+
+ // We do not filter when PTF is in reducer.
+ Preconditions.checkState(!batch.selectedInUse);
+
+ final int size = batch.size;
+ if (size == 0) {
+ return;
+ }
+ DoubleColumnVector doubleColVector = ((DoubleColumnVector) batch.cols[inputColumnNum]);
+ if (doubleColVector.isRepeating) {
+ if (doubleColVector.noNulls) {
+ if (isNull) {
+ max = doubleColVector.vector[0];
+ isNull = false;
+ } else {
+ final double repeatedMax = doubleColVector.vector[0];
+ if (repeatedMax > max) {
+ max = repeatedMax;
+ }
+ }
+ }
+ } else if (doubleColVector.noNulls) {
+ double[] vector = doubleColVector.vector;
+ double varMax = vector[0];
+ for (int i = 1; i < size; i++) {
+ final double d = vector[i];
+ if (d > varMax) {
+ varMax = d;
+ }
+ }
+ if (isNull) {
+ max = varMax;
+ isNull = false;
+ } else if (varMax > max) {
+ max = varMax;
+ }
+ } else {
+ boolean[] batchIsNull = doubleColVector.isNull;
+ int i = 0;
+ while (batchIsNull[i]) {
+ if (++i >= size) {
+ return;
+ }
+ }
+ double[] vector = doubleColVector.vector;
+ double varMax = vector[i++];
+ for (; i < size; i++) {
+ if (!batchIsNull[i]) {
+ final double d = vector[i];
+ if (d > varMax) {
+ varMax = d;
+ }
+ }
+ }
+ if (isNull) {
+ max = varMax;
+ isNull = false;
+ } else if (varMax > max) {
+ max = varMax;
+ }
+ }
+ }
+
+ @Override
+ public void evaluateEndOfGroup(int groupCount) {
+ // Nothing to do.
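+ // The running max is already final when the group ends; getDoubleGroupValue() simply
+ // returns it.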
+ } + + @Override + public boolean isGroupValueNull() { + return isNull; + } + + @Override + public Type getColumnVectorType() { + return Type.DOUBLE; + } + + @Override + public double getDoubleGroupValue() { + return max; + } + + @Override + public void resetEvaluator() { + isNull = true; + max = Double.MIN_VALUE; + } +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorDoubleMin.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorDoubleMin.java new file mode 100644 index 0000000..03eec42 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorDoubleMin.java @@ -0,0 +1,141 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.ptf; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector.Type; +import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; +import org.apache.hadoop.hive.ql.plan.ptf.WindowFrameDef; + +import com.google.common.base.Preconditions; + +/** + * This class evaluates double min() for a PTF group. + */ +public class VectorPTFEvaluatorDoubleMin extends VectorPTFEvaluatorBase { + + private static final long serialVersionUID = 1L; + private static final String CLASS_NAME = VectorPTFEvaluatorDoubleMin.class.getName(); + private static final Log LOG = LogFactory.getLog(CLASS_NAME); + + protected boolean isNull; + protected double min; + + public VectorPTFEvaluatorDoubleMin(WindowFrameDef windowFrameDef, VectorExpression inputVecExpr, + int outputColumnNum) { + super(windowFrameDef, inputVecExpr, outputColumnNum); + resetEvaluator(); + } + + public void evaluateGroupBatch(VectorizedRowBatch batch) { + super.evaluateGroupBatch(batch); + + // Min double column; maintain isNull. + + // We do not filter when PTF is in reducer. 
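+ // As in the other evaluators, assert the dense-batch invariant before scanning.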
+ Preconditions.checkState(!batch.selectedInUse); + + final int size = batch.size; + if (size == 0) { + return; + } + DoubleColumnVector doubleColVector = ((DoubleColumnVector) batch.cols[inputColumnNum]); + if (doubleColVector.isRepeating) { + if (doubleColVector.noNulls) { + if (isNull) { + min = doubleColVector.vector[0]; + isNull = false; + } else { + final double repeatedMin = doubleColVector.vector[0]; + if (repeatedMin < min) { + min = repeatedMin; + } + } + } + } else if (doubleColVector.noNulls) { + double[] vector = doubleColVector.vector; + double varMin = vector[0]; + for (int i = 1; i < size; i++) { + final double d = vector[i]; + if (d < varMin) { + varMin = d; + } + } + if (isNull) { + min = varMin; + isNull = false; + } else if (varMin < min) { + min = varMin; + } + } else { + boolean[] batchIsNull = doubleColVector.isNull; + int i = 0; + while (batchIsNull[i]) { + if (++i >= size) { + return; + } + } + double[] vector = doubleColVector.vector; + double varMin = vector[i++]; + for (; i < size; i++) { + if (!batchIsNull[i]) { + final double d = vector[i]; + if (d < varMin) { + varMin = d; + } + } + } + if (isNull) { + min = varMin; + isNull = false; + } else if (varMin < min) { + min = varMin; + } + } + } + + @Override + public void evaluateEndOfGroup(int groupCount) { + // Nothing to do. + } + + @Override + public boolean isGroupValueNull() { + return isNull; + } + + @Override + public Type getColumnVectorType() { + return Type.DOUBLE; + } + + @Override + public double getDoubleGroupValue() { + return min; + } + + @Override + public void resetEvaluator() { + isNull = true; + min = Double.MAX_VALUE; + } +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorDoubleSum.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorDoubleSum.java new file mode 100644 index 0000000..7387f9e --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorDoubleSum.java @@ -0,0 +1,132 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.ptf; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector.Type; +import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; +import org.apache.hadoop.hive.ql.plan.ptf.WindowFrameDef; + +import com.google.common.base.Preconditions; + +/** + * This class evaluates double sum() for a PTF group. 
+ */ +public class VectorPTFEvaluatorDoubleSum extends VectorPTFEvaluatorBase { + + private static final long serialVersionUID = 1L; + private static final String CLASS_NAME = VectorPTFEvaluatorDoubleSum.class.getName(); + private static final Log LOG = LogFactory.getLog(CLASS_NAME); + + protected boolean isNull; + protected double sum; + + public VectorPTFEvaluatorDoubleSum(WindowFrameDef windowFrameDef, VectorExpression inputVecExpr, + int outputColumnNum) { + super(windowFrameDef, inputVecExpr, outputColumnNum); + resetEvaluator(); + } + + public void evaluateGroupBatch(VectorizedRowBatch batch) { + super.evaluateGroupBatch(batch); + + // Sum double column; maintain isNull. + + // We do not filter when PTF is in reducer. + Preconditions.checkState(!batch.selectedInUse); + + final int size = batch.size; + if (size == 0) { + return; + } + DoubleColumnVector doubleColVector = ((DoubleColumnVector) batch.cols[inputColumnNum]); + if (doubleColVector.isRepeating) { + if (doubleColVector.noNulls) { + if (isNull) { + sum = doubleColVector.vector[0] * batch.size; + isNull = false; + } else { + sum += doubleColVector.vector[0] * batch.size; + } + } + } else if (doubleColVector.noNulls) { + double[] vector = doubleColVector.vector; + double varSum = vector[0]; + for (int i = 1; i < size; i++) { + varSum += vector[i]; + } + if (isNull) { + sum = varSum; + isNull = false; + } else { + sum += varSum; + } + } else { + boolean[] batchIsNull = doubleColVector.isNull; + int i = 0; + while (batchIsNull[i]) { + if (++i >= size) { + return; + } + } + double[] vector = doubleColVector.vector; + double varSum = vector[i++]; + for (; i < size; i++) { + if (!batchIsNull[i]) { + varSum += vector[i]; + } + } + if (isNull) { + sum = varSum; + isNull = false; + } else { + sum += varSum; + } + } + } + + @Override + public void evaluateEndOfGroup(int groupCount) { + // Nothing to do. + } + + @Override + public boolean isGroupValueNull() { + return isNull; + } + + @Override + public Type getColumnVectorType() { + return Type.DOUBLE; + } + + @Override + public double getDoubleGroupValue() { + return sum; + } + + @Override + public void resetEvaluator() { + isNull = true; + sum = 0.0; + } +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorLongAvg.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorLongAvg.java new file mode 100644 index 0000000..860f434 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorLongAvg.java @@ -0,0 +1,142 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.ql.exec.vector.ptf; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector.Type; +import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; +import org.apache.hadoop.hive.ql.plan.ptf.WindowFrameDef; + +import com.google.common.base.Preconditions; + +/** + * This class evaluates long avg() for a PTF group. + */ +public class VectorPTFEvaluatorLongAvg extends VectorPTFEvaluatorBase { + + private static final long serialVersionUID = 1L; + private static final String CLASS_NAME = VectorPTFEvaluatorLongAvg.class.getName(); + private static final Log LOG = LogFactory.getLog(CLASS_NAME); + + protected boolean isNull; + protected long sum; + private int nonNullGroupCount; + private double avg; + + public VectorPTFEvaluatorLongAvg(WindowFrameDef windowFrameDef, VectorExpression inputVecExpr, + int outputColumnNum) { + super(windowFrameDef, inputVecExpr, outputColumnNum); + resetEvaluator(); + } + + public void evaluateGroupBatch(VectorizedRowBatch batch) { + super.evaluateGroupBatch(batch); + + // Sum long column for avg; maintain isNull. + + // We do not filter when PTF is in reducer. + Preconditions.checkState(!batch.selectedInUse); + + final int size = batch.size; + if (size == 0) { + return; + } + LongColumnVector longColVector = ((LongColumnVector) batch.cols[inputColumnNum]); + if (longColVector.isRepeating) { + if (longColVector.noNulls) { + if (isNull) { + sum = longColVector.vector[0] * batch.size; + isNull = false; + } else { + sum += longColVector.vector[0] * batch.size; + } + nonNullGroupCount += size; + } + } else if (longColVector.noNulls) { + long[] vector = longColVector.vector; + long varSum = vector[0]; + for (int i = 1; i < size; i++) { + varSum += vector[i]; + } + nonNullGroupCount += size; + if (isNull) { + sum = varSum; + isNull = false; + } else { + sum += varSum; + } + } else { + boolean[] batchIsNull = longColVector.isNull; + int i = 0; + while (batchIsNull[i]) { + if (++i >= size) { + return; + } + } + long[] vector = longColVector.vector; + long varSum = vector[i++]; + nonNullGroupCount++; + for (; i < size; i++) { + if (!batchIsNull[i]) { + varSum += vector[i]; + nonNullGroupCount++; + } + } + if (isNull) { + sum = varSum; + isNull = false; + } else { + sum += varSum; + } + } + } + + @Override + public void evaluateEndOfGroup(int groupCount) { + if (!isNull) { + avg = ((double) sum) / nonNullGroupCount; + } + } + + @Override + public boolean isGroupValueNull() { + return isNull; + } + + @Override + public Type getColumnVectorType() { + return Type.DOUBLE; + } + + @Override + public double getDoubleGroupValue() { + return avg; + } + + @Override + public void resetEvaluator() { + isNull = true; + sum = 0; + nonNullGroupCount = 0; + avg = 0.0; + } +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorLongFirstValue.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorLongFirstValue.java new file mode 100644 index 0000000..9beea8f --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorLongFirstValue.java @@ -0,0 +1,119 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.ptf; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector.Type; +import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; +import org.apache.hadoop.hive.ql.plan.ptf.WindowFrameDef; + +import com.google.common.base.Preconditions; + +/** + * This class evaluates long first_value() for a PTF group. + */ +public class VectorPTFEvaluatorLongFirstValue extends VectorPTFEvaluatorBase { + + private static final long serialVersionUID = 1L; + private static final String CLASS_NAME = VectorPTFEvaluatorLongFirstValue.class.getName(); + private static final Log LOG = LogFactory.getLog(CLASS_NAME); + + protected boolean haveFirstValue; + protected boolean isNull; + protected long firstValue; + + public VectorPTFEvaluatorLongFirstValue(WindowFrameDef windowFrameDef, + VectorExpression inputVecExpr, int outputColumnNum) { + super(windowFrameDef, inputVecExpr, outputColumnNum); + resetEvaluator(); + } + + public void evaluateGroupBatch(VectorizedRowBatch batch) { + super.evaluateGroupBatch(batch); + + // Capture first long; maintain isNull; stream fill result as repeated. + + // We do not filter when PTF is in reducer. + Preconditions.checkState(!batch.selectedInUse); + + if (!haveFirstValue) { + final int size = batch.size; + if (size == 0) { + return; + } + LongColumnVector longColVector = ((LongColumnVector) batch.cols[inputColumnNum]); + if (longColVector.isRepeating) { + if (longColVector.noNulls) { + firstValue = longColVector.vector[0]; + isNull = false; + } else { + isNull = true; + } + } else if (longColVector.noNulls) { + firstValue = longColVector.vector[0]; + isNull = false; + } else { + if (longColVector.isNull[0]) { + isNull = true; + } else { + firstValue = longColVector.vector[0]; + isNull = false; + } + } + haveFirstValue = true; + } + + // First value is repeated for all batches. + LongColumnVector outputColVector = (LongColumnVector) batch.cols[outputColumnNum]; + outputColVector.isRepeating = true; + if (isNull) { + outputColVector.noNulls = false; + outputColVector.isNull[0] = true; + } else { + outputColVector.noNulls = true; + outputColVector.isNull[0] = false; + outputColVector.vector[0] = firstValue; + } + } + + @Override + public void evaluateEndOfGroup(int groupCount) { + // Nothing to do. 
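+ // first_value streams its result: evaluateGroupBatch above fills the output column as
+ // each batch arrives (streamsResult() returns true), so no end-of-group work remains.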
+ }
+
+ public boolean streamsResult() {
+ return true;
+ }
+
+ @Override
+ public Type getColumnVectorType() {
+ return Type.LONG;
+ }
+
+ @Override
+ public void resetEvaluator() {
+ haveFirstValue = false;
+ isNull = true;
+ firstValue = 0;
+ }
+}
\ No newline at end of file
diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorLongLastValue.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorLongLastValue.java
new file mode 100644
index 0000000..7bd4a9e
--- /dev/null
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorLongLastValue.java
@@ -0,0 +1,109 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.vector.ptf;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.hive.ql.exec.vector.ColumnVector.Type;
+import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression;
+import org.apache.hadoop.hive.ql.plan.ptf.WindowFrameDef;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * This class evaluates long last_value() for a PTF group.
+ */
+public class VectorPTFEvaluatorLongLastValue extends VectorPTFEvaluatorBase {
+
+ private static final long serialVersionUID = 1L;
+ private static final String CLASS_NAME = VectorPTFEvaluatorLongLastValue.class.getName();
+ private static final Log LOG = LogFactory.getLog(CLASS_NAME);
+
+ protected boolean isNull;
+ protected long lastValue;
+
+ public VectorPTFEvaluatorLongLastValue(WindowFrameDef windowFrameDef,
+ VectorExpression inputVecExpr, int outputColumnNum) {
+ super(windowFrameDef, inputVecExpr, outputColumnNum);
+ resetEvaluator();
+ }
+
+ public void evaluateGroupBatch(VectorizedRowBatch batch) {
+ super.evaluateGroupBatch(batch);
+
+ // Capture last long; maintain isNull.
+
+ // We do not filter when PTF is in reducer.
+ Preconditions.checkState(!batch.selectedInUse);
+
+ final int size = batch.size;
+ if (size == 0) {
+ return;
+ }
+ // Remember last value of each batch.
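+ // Only a batch's last non-null value can still become the group's last value, so each
+ // batch just overwrites the running candidate; no rows are buffered.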
+ LongColumnVector longColVector = ((LongColumnVector) batch.cols[inputColumnNum]); + if (longColVector.isRepeating) { + if (longColVector.noNulls) { + lastValue = longColVector.vector[0]; + isNull = false; + } + } else if (longColVector.noNulls) { + lastValue = longColVector.vector[size - 1]; + isNull = false; + } else { + boolean[] batchIsNull = longColVector.isNull; + int i = size - 1; + while (batchIsNull[i]) { + if (--i < 0) { + return; + } + } + lastValue = longColVector.vector[i]; + isNull = false; + } + } + + @Override + public void evaluateEndOfGroup(int groupCount) { + // Nothing to do. + } + + @Override + public boolean isGroupValueNull() { + return isNull; + } + + @Override + public Type getColumnVectorType() { + return Type.LONG; + } + + @Override + public long getLongGroupValue() { + return lastValue; + } + + @Override + public void resetEvaluator() { + isNull = true; + lastValue = 0; + } +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorLongMax.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorLongMax.java new file mode 100644 index 0000000..bf90773 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorLongMax.java @@ -0,0 +1,141 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.ptf; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector.Type; +import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; +import org.apache.hadoop.hive.ql.plan.ptf.WindowFrameDef; + +import com.google.common.base.Preconditions; + +/** + * This class evaluates long max() for a PTF group. + */ +public class VectorPTFEvaluatorLongMax extends VectorPTFEvaluatorBase { + + private static final long serialVersionUID = 1L; + private static final String CLASS_NAME = VectorPTFEvaluatorLongMax.class.getName(); + private static final Log LOG = LogFactory.getLog(CLASS_NAME); + + protected boolean isNull; + protected long max; + + public VectorPTFEvaluatorLongMax(WindowFrameDef windowFrameDef, VectorExpression inputVecExpr, + int outputColumnNum) { + super(windowFrameDef, inputVecExpr, outputColumnNum); + resetEvaluator(); + } + + public void evaluateGroupBatch(VectorizedRowBatch batch) { + super.evaluateGroupBatch(batch); + + // Max long column; maintain isNull. + + // We do not filter when PTF is in reducer. 
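+ // The reducer feeds PTF whole, unfiltered batches; a selectedInUse batch here would
+ // indicate an upstream bug.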
+ Preconditions.checkState(!batch.selectedInUse); + + final int size = batch.size; + if (size == 0) { + return; + } + LongColumnVector longColVector = ((LongColumnVector) batch.cols[inputColumnNum]); + if (longColVector.isRepeating) { + if (longColVector.noNulls) { + if (isNull) { + max = longColVector.vector[0]; + isNull = false; + } else { + final long repeatedMax = longColVector.vector[0]; + if (repeatedMax > max) { + max = repeatedMax; + } + } + } + } else if (longColVector.noNulls) { + long[] vector = longColVector.vector; + long varMax = vector[0]; + for (int i = 1; i < size; i++) { + final long l = vector[i]; + if (l > varMax) { + varMax = l; + } + } + if (isNull) { + max = varMax; + isNull = false; + } else if (varMax > max) { + max = varMax; + } + } else { + boolean[] batchIsNull = longColVector.isNull; + int i = 0; + while (batchIsNull[i]) { + if (++i >= size) { + return; + } + } + long[] vector = longColVector.vector; + long varMax = vector[i++]; + for (; i < size; i++) { + if (!batchIsNull[i]) { + final long l = vector[i]; + if (l > varMax) { + varMax = l; + } + } + } + if (isNull) { + max = varMax; + isNull = false; + } else if (varMax > max) { + max = varMax; + } + } + } + + @Override + public void evaluateEndOfGroup(int groupCount) { + // Nothing to do. + } + + @Override + public boolean isGroupValueNull() { + return isNull; + } + + @Override + public Type getColumnVectorType() { + return Type.LONG; + } + + @Override + public long getLongGroupValue() { + return max; + } + + @Override + public void resetEvaluator() { + isNull = true; + max = Long.MIN_VALUE; + } +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorLongMin.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorLongMin.java new file mode 100644 index 0000000..f2513fd --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorLongMin.java @@ -0,0 +1,141 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.ptf; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector.Type; +import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; +import org.apache.hadoop.hive.ql.plan.ptf.WindowFrameDef; + +import com.google.common.base.Preconditions; + +/** + * This class evaluates long min() for a PTF group. 
+ */ +public class VectorPTFEvaluatorLongMin extends VectorPTFEvaluatorBase { + + private static final long serialVersionUID = 1L; + private static final String CLASS_NAME = VectorPTFEvaluatorLongMin.class.getName(); + private static final Log LOG = LogFactory.getLog(CLASS_NAME); + + protected boolean isNull; + protected long min; + + public VectorPTFEvaluatorLongMin(WindowFrameDef windowFrameDef, VectorExpression inputVecExpr, + int outputColumnNum) { + super(windowFrameDef, inputVecExpr, outputColumnNum); + resetEvaluator(); + } + + public void evaluateGroupBatch(VectorizedRowBatch batch) { + super.evaluateGroupBatch(batch); + + // Min long column; maintain isNull. + + // We do not filter when PTF is in reducer. + Preconditions.checkState(!batch.selectedInUse); + + final int size = batch.size; + if (size == 0) { + return; + } + LongColumnVector longColVector = ((LongColumnVector) batch.cols[inputColumnNum]); + if (longColVector.isRepeating) { + if (longColVector.noNulls) { + if (isNull) { + min = longColVector.vector[0]; + isNull = false; + } else { + final long repeatedMin = longColVector.vector[0]; + if (repeatedMin < min) { + min = repeatedMin; + } + } + } + } else if (longColVector.noNulls) { + long[] vector = longColVector.vector; + long varMin = vector[0]; + for (int i = 1; i < size; i++) { + final long l = vector[i]; + if (l < varMin) { + varMin = l; + } + } + if (isNull) { + min = varMin; + isNull = false; + } else if (varMin < min) { + min = varMin; + } + } else { + boolean[] batchIsNull = longColVector.isNull; + int i = 0; + while (batchIsNull[i]) { + if (++i >= size) { + return; + } + } + long[] vector = longColVector.vector; + long varMin = vector[i++]; + for (; i < size; i++) { + if (!batchIsNull[i]) { + final long l = vector[i]; + if (l < varMin) { + varMin = l; + } + } + } + if (isNull) { + min = varMin; + isNull = false; + } else if (varMin < min) { + min = varMin; + } + } + } + + @Override + public void evaluateEndOfGroup(int groupCount) { + // Nothing to do. + } + + @Override + public boolean isGroupValueNull() { + return isNull; + } + + @Override + public Type getColumnVectorType() { + return Type.LONG; + } + + @Override + public long getLongGroupValue() { + return min; + } + + @Override + public void resetEvaluator() { + isNull = true; + min = Long.MAX_VALUE; + } +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorLongSum.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorLongSum.java new file mode 100644 index 0000000..5bfb93f --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorLongSum.java @@ -0,0 +1,132 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.ql.exec.vector.ptf; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector.Type; +import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; +import org.apache.hadoop.hive.ql.plan.ptf.WindowFrameDef; + +import com.google.common.base.Preconditions; + +/** + * This class evaluates long sum() for a PTF group. + */ +public class VectorPTFEvaluatorLongSum extends VectorPTFEvaluatorBase { + + private static final long serialVersionUID = 1L; + private static final String CLASS_NAME = VectorPTFEvaluatorLongSum.class.getName(); + private static final Log LOG = LogFactory.getLog(CLASS_NAME); + + protected boolean isNull; + protected long sum; + + public VectorPTFEvaluatorLongSum(WindowFrameDef windowFrameDef, VectorExpression inputVecExpr, + int outputColumnNum) { + super(windowFrameDef, inputVecExpr, outputColumnNum); + resetEvaluator(); + } + + public void evaluateGroupBatch(VectorizedRowBatch batch) { + super.evaluateGroupBatch(batch); + + // Sum long column; maintain isNull. + + // We do not filter when PTF is in reducer. + Preconditions.checkState(!batch.selectedInUse); + + final int size = batch.size; + if (size == 0) { + return; + } + LongColumnVector longColVector = ((LongColumnVector) batch.cols[inputColumnNum]); + if (longColVector.isRepeating) { + if (longColVector.noNulls) { + if (isNull) { + sum = longColVector.vector[0] * batch.size; + isNull = false; + } else { + sum += longColVector.vector[0] * batch.size; + } + } + } else if (longColVector.noNulls) { + long[] vector = longColVector.vector; + long varSum = vector[0]; + for (int i = 1; i < size; i++) { + varSum += vector[i]; + } + if (isNull) { + sum = varSum; + isNull = false; + } else { + sum += varSum; + } + } else { + boolean[] batchIsNull = longColVector.isNull; + int i = 0; + while (batchIsNull[i]) { + if (++i >= size) { + return; + } + } + long[] vector = longColVector.vector; + long varSum = vector[i++]; + for (; i < size; i++) { + if (!batchIsNull[i]) { + varSum += vector[i]; + } + } + if (isNull) { + sum = varSum; + isNull = false; + } else { + sum += varSum; + } + } + } + + @Override + public void evaluateEndOfGroup(int groupCount) { + // Nothing to do. + } + + @Override + public boolean isGroupValueNull() { + return isNull; + } + + @Override + public Type getColumnVectorType() { + return Type.LONG; + } + + @Override + public long getLongGroupValue() { + return sum; + } + + @Override + public void resetEvaluator() { + isNull = true; + sum = 0; + } +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorRank.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorRank.java new file mode 100644 index 0000000..f5727ec --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorRank.java @@ -0,0 +1,77 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.ptf; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector.Type; +import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; +import org.apache.hadoop.hive.ql.plan.ptf.WindowFrameDef; + +/** + * This class evaluates rank() for a PTF group. + */ +public class VectorPTFEvaluatorRank extends VectorPTFEvaluatorBase { + + private static final long serialVersionUID = 1L; + private static final String CLASS_NAME = VectorPTFEvaluatorRank.class.getName(); + private static final Log LOG = LogFactory.getLog(CLASS_NAME); + + private int rank; + private int groupRank; + + public VectorPTFEvaluatorRank(WindowFrameDef windowFrameDef, VectorExpression inputVecExpr, + int outputColumnNum) { + super(windowFrameDef, inputVecExpr, outputColumnNum); + resetEvaluator(); + } + + public void evaluateGroupBatch(VectorizedRowBatch batch) { + super.evaluateGroupBatch(batch); + + // Nothing to do. + } + + @Override + public void evaluateEndOfGroup(int groupCount) { + groupRank = rank; + rank += groupCount; + } + + public boolean isGroupValueNull() { + return false; + } + + @Override + public Type getColumnVectorType() { + return Type.LONG; + } + + @Override + public long getLongGroupValue() { + return groupRank; + } + + @Override + public void resetEvaluator() { + rank = 1; + groupRank = 0; + } +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorRowNumber.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorRowNumber.java new file mode 100644 index 0000000..942f043 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFEvaluatorRowNumber.java @@ -0,0 +1,80 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.ql.exec.vector.ptf; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector.Type; +import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; +import org.apache.hadoop.hive.ql.plan.ptf.WindowFrameDef; + +/** + * This class evaluates row_number() for a PTF group. + */ +public class VectorPTFEvaluatorRowNumber extends VectorPTFEvaluatorBase { + + private static final long serialVersionUID = 1L; + private static final String CLASS_NAME = VectorPTFEvaluatorRowNumber.class.getName(); + private static final Log LOG = LogFactory.getLog(CLASS_NAME); + + private int rowNumber; + + public VectorPTFEvaluatorRowNumber(WindowFrameDef windowFrameDef, VectorExpression inputVecExpr, + int outputColumnNum) { + super(windowFrameDef, inputVecExpr, outputColumnNum); + resetEvaluator(); + } + + public void evaluateGroupBatch(VectorizedRowBatch batch) { + super.evaluateGroupBatch(batch); + + final int size = batch.size; + LongColumnVector longColVector = (LongColumnVector) batch.cols[outputColumnNum]; + long[] vector = longColVector.vector; + for (int i = 0; i < size; i++) { + vector[i] = rowNumber++; + } + } + + @Override + public void evaluateEndOfGroup(int groupCount) { + // Nothing to do -- we stream the row number in with evaluateGroupBatch. + } + + public boolean streamsResult() { + // No group value. + return true; + } + + public boolean isGroupValueNull() { + return false; + } + + @Override + public Type getColumnVectorType() { + return Type.LONG; + } + + @Override + public void resetEvaluator() { + rowNumber = 1; + } +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFGroupBatches.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFGroupBatches.java new file mode 100644 index 0000000..f26725b --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFGroupBatches.java @@ -0,0 +1,225 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package org.apache.hadoop.hive.ql.exec.vector.ptf;
+
+import java.util.ArrayList;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedBatchUtil;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * This class encapsulates one or more VectorizedRowBatches of a PTF group.
+ */
+public class VectorPTFGroupBatches {
+
+  private static final long serialVersionUID = 1L;
+  private static final String CLASS_NAME = VectorPTFGroupBatches.class.getName();
+  private static final Log LOG = LogFactory.getLog(CLASS_NAME);
+
+  private int groupRowCount;
+
+  private VectorPTFEvaluatorBase[] evaluators;
+  private int[] outputColumnMap;
+  private int[] keyInputColumnMap;
+  private int[] bufferedColumnMap;
+
+  private ArrayList<VectorizedRowBatch> bufferedBatches;
+
+  private VectorizedRowBatch overflowBatch;
+
+  private int allocatedBufferedBatchCount;
+  private int currentBufferedBatchCount;
+
+  public VectorPTFGroupBatches() {
+    groupRowCount = 0;
+    allocatedBufferedBatchCount = 0;
+    currentBufferedBatchCount = 0;
+  }
+
+  public void init(VectorPTFEvaluatorBase[] evaluators, int[] outputColumnMap,
+      int[] keyInputColumnMap, int[] nonKeyInputColumnMap, int[] streamingColumnMap,
+      VectorizedRowBatch overflowBatch) {
+    this.evaluators = evaluators;
+    this.outputColumnMap = outputColumnMap;
+    this.keyInputColumnMap = keyInputColumnMap;
+    final int nonKeyInputColumnCount = nonKeyInputColumnMap.length;
+    final int streamingColumnCount = streamingColumnMap.length;
+    final int bufferedColumnCount = nonKeyInputColumnCount + streamingColumnCount;
+    bufferedColumnMap = new int[bufferedColumnCount];
+    for (int i = 0; i < nonKeyInputColumnCount; i++) {
+      bufferedColumnMap[i] = nonKeyInputColumnMap[i];
+    }
+    for (int i = nonKeyInputColumnCount; i < bufferedColumnCount; i++) {
+      bufferedColumnMap[i] = streamingColumnMap[i - nonKeyInputColumnCount];
+    }
+    this.overflowBatch = overflowBatch;
+    bufferedBatches = new ArrayList<VectorizedRowBatch>(0);
+  }
+
+  public void evaluateStreamingGroupBatch(VectorizedRowBatch batch) {
+    // Streaming evaluators fill in their results now.
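+    //
+    // Illustrative walkthrough (an assumption added for exposition, not in the original
+    // patch): with a single row_number() evaluator and two batches of the same group, sized
+    // 3 and 2, the first call writes 1,2,3 into the output column of batch 1 and the second
+    // writes 4,5 into batch 2 -- no rows need buffering, so each batch is forwarded right away.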
+    for (VectorPTFEvaluatorBase evaluator : evaluators) {
+      evaluator.evaluateGroupBatch(batch);
+    }
+    groupRowCount = 0;
+  }
+
+  public void evaluateGroupBatch(VectorizedRowBatch batch) {
+    for (VectorPTFEvaluatorBase evaluator : evaluators) {
+      evaluator.evaluateGroupBatch(batch);
+    }
+    groupRowCount += batch.size;
+  }
+
+  private void fillGroupResults(VectorizedRowBatch batch) {
+    for (VectorPTFEvaluatorBase evaluator : evaluators) {
+      final int outputColumnNum = evaluator.getOutputColumnNum();
+      if (evaluator.streamsResult()) {
+        continue;
+      }
+      final ColumnVector outputColVector = batch.cols[outputColumnNum];
+      outputColVector.isRepeating = true;
+      final boolean isGroupValueNull = evaluator.isGroupValueNull();
+      outputColVector.isNull[0] = isGroupValueNull;
+      if (isGroupValueNull) {
+        outputColVector.noNulls = false;
+      } else {
+        outputColVector.noNulls = true;
+        switch (evaluator.getColumnVectorType()) {
+        case LONG:
+          ((LongColumnVector) outputColVector).vector[0] = evaluator.getLongGroupValue();
+          break;
+        case DOUBLE:
+          ((DoubleColumnVector) outputColVector).vector[0] = evaluator.getDoubleGroupValue();
+          break;
+        case DECIMAL:
+          ((DecimalColumnVector) outputColVector).vector[0].set(evaluator.getDecimalGroupValue());
+          break;
+        default:
+          throw new RuntimeException("Unexpected column vector type " + evaluator.getColumnVectorType());
+        }
+      }
+    }
+  }
+
+  private void forwardBufferedBatches(VectorPTFOperator vecPTFOperator, int index)
+      throws HiveException {
+    VectorizedRowBatch bufferedBatch = bufferedBatches.get(index);
+
+    final int size = bufferedColumnMap.length;
+    for (int i = 0; i < size; i++) {
+
+      // Swap ColumnVectors with overflowBatch.  We remember buffered columns compactly in the
+      // buffered VRBs without other columns or scratch columns.
+      VectorizedBatchUtil.swapColumnVector(
+          bufferedBatch, i, overflowBatch, bufferedColumnMap[i]);
+    }
+
+    overflowBatch.size = bufferedBatch.size;
+    fillGroupResults(overflowBatch);
+    vecPTFOperator.forward(overflowBatch, null);
+  }
+
+  public void fillGroupResultsAndForward(VectorPTFOperator vecPTFOperator,
+      VectorizedRowBatch lastBatch) throws HiveException {
+    for (VectorPTFEvaluatorBase evaluator : evaluators) {
+      evaluator.evaluateEndOfGroup(groupRowCount);
+    }
+    groupRowCount = 0;
+
+    if (currentBufferedBatchCount > 0) {
+
+      // Set partition and order columns in overflowBatch.
+      // We can set by ref since our last batch is held by us.
+      final int keyInputColumnCount = keyInputColumnMap.length;
+      for (int i = 0; i < keyInputColumnCount; i++) {
+        VectorizedBatchUtil.copyRepeatingColumn(lastBatch, i, overflowBatch, i, /* setByValue */ false);
+      }
+
+      for (int i = 0; i < currentBufferedBatchCount; i++) {
+        forwardBufferedBatches(vecPTFOperator, i);
+      }
+      currentBufferedBatchCount = 0;
+    }
+
+    fillGroupResults(lastBatch);
+
+    // Save original projection.
+    int[] originalProjections = lastBatch.projectedColumns;
+    int originalProjectionSize = lastBatch.projectionSize;
+
+    // Project with the output of our operator.
+    lastBatch.projectionSize = outputColumnMap.length;
+    lastBatch.projectedColumns = outputColumnMap;
+
+    vecPTFOperator.forward(lastBatch, null);
+
+    // Revert the projected columns back, because the batch can be re-used by our parent operators.
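+    //
+    // Illustration (an assumption added for exposition, not in the original patch): if the
+    // input columns are {0, 1} and the lone evaluator writes to scratch column 4, the
+    // forward() above presents the batch projected as [0, 1, 4] so children see the PTF
+    // output schema; the two statements below restore the projection the batch arrived with.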
+ lastBatch.projectionSize = originalProjectionSize; + lastBatch.projectedColumns = originalProjections; + + } + + public void resetEvaluators() { + for (VectorPTFEvaluatorBase evaluator : evaluators) { + evaluator.resetEvaluator(); + } + Preconditions.checkState(groupRowCount == 0); + } + + private VectorizedRowBatch newBufferedBatch(VectorizedRowBatch batch) throws HiveException { + final int bufferedColumnCount = bufferedColumnMap.length; + VectorizedRowBatch newBatch = new VectorizedRowBatch(bufferedColumnCount); + for (int i = 0; i < bufferedColumnCount; i++) { + newBatch.cols[i] = + VectorizedBatchUtil.makeLikeColumnVector(batch.cols[bufferedColumnMap[i]]); + newBatch.cols[i].init(); + } + return newBatch; + } + + public void bufferGroupBatch(VectorizedRowBatch batch) throws HiveException { + + final int bufferedColumnCount = bufferedColumnMap.length; + if (allocatedBufferedBatchCount <= currentBufferedBatchCount) { + VectorizedRowBatch newBatch = newBufferedBatch(batch); + bufferedBatches.add(newBatch); + allocatedBufferedBatchCount++; + } + + VectorizedRowBatch bufferedBatch = bufferedBatches.get(currentBufferedBatchCount++); + + for (int i = 0; i < bufferedColumnCount; i++) { + VectorizedBatchUtil.swapColumnVector( + batch, bufferedColumnMap[i], bufferedBatch, i); + } + + bufferedBatch.size = batch.size; + } +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFOperator.java new file mode 100644 index 0000000..4155443 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ptf/VectorPTFOperator.java @@ -0,0 +1,571 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package org.apache.hadoop.hive.ql.exec.vector.ptf;
+
+import java.io.IOException;
+import java.sql.Timestamp;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Properties;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.commons.lang.ArrayUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hive.common.type.HiveIntervalDayTime;
+import org.apache.hadoop.hive.ql.CompilationOpContext;
+import org.apache.hadoop.hive.ql.exec.Operator;
+import org.apache.hadoop.hive.ql.exec.Utilities;
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.IntervalDayTimeColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizationContextRegion;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedBatchUtil;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.hive.ql.exec.vector.ColumnVector.Type;
+import org.apache.hadoop.hive.ql.exec.vector.expressions.IdentityExpression;
+import org.apache.hadoop.hive.ql.exec.vector.expressions.StringExpr;
+import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.plan.BaseWork;
+import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
+import org.apache.hadoop.hive.ql.plan.OperatorDesc;
+import org.apache.hadoop.hive.ql.plan.PTFDesc;
+import org.apache.hadoop.hive.ql.plan.VectorPTFDesc;
+import org.apache.hadoop.hive.ql.plan.VectorPTFDesc.SupportedFunctionType;
+import org.apache.hadoop.hive.ql.plan.api.OperatorType;
+import org.apache.hadoop.hive.ql.plan.ptf.WindowFrameDef;
+import org.apache.hadoop.hive.ql.plan.VectorPTFInfo;
+import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
+import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
+
+/**
+ * This class is the native vectorized PTF operator class.
+ */
+public class VectorPTFOperator extends Operator<PTFDesc>
+    implements VectorizationContextRegion {
+
+  private static final long serialVersionUID = 1L;
+  private static final String CLASS_NAME = VectorPTFOperator.class.getName();
+  private static final Log LOG = LogFactory.getLog(CLASS_NAME);
+
+  private VectorPTFDesc vectorDesc;
+
+  /**
+   * Information about our native vectorized PTF created by the Vectorizer class during
+   * its decision process and useful for execution.
+   */
+  private VectorPTFInfo vectorPTFInfo;
+
+  private VectorizationContext vContext;
+
+  // This is the vectorized row batch description of the output of the native vectorized PTF
+  // operator.  It is based on the incoming vectorization context.  Its projection may include
+  // a mixture of input columns and new scratch columns (for the aggregation output).
+  protected VectorizationContext vOutContext;
+
+  private boolean isPartitionOrderBy;
+
+  /**
+   * PTF vector expressions.
+   */
+
+  // This is a map of which vectorized row batch columns are the input columns and the group
+  // value (aggregation) output columns, and their types.
+  private int[] outputColumnMap;
+  private String[] outputColumnNames;
+  private TypeInfo[] outputTypeInfos;
+
+  private int evaluatorCount;
+  private String[] evaluatorFunctionNames;
+  private WindowFrameDef[] evaluatorWindowFrameDefs;
+  private ExprNodeDesc[] evaluatorInputExprNodeDescs;
+  private VectorExpression[] evaluatorInputExpressions;
+  private Type[] evaluatorInputColumnVectorTypes;
+
+  private ExprNodeDesc[] orderExprNodeDescs;
+  private int[] orderColumnMap;
+  private Type[] orderColumnVectorTypes;
+  private VectorExpression[] orderExpressions;
+
+  private ExprNodeDesc[] partitionExprNodeDescs;
+  private int[] partitionColumnMap;
+  private Type[] partitionColumnVectorTypes;
+  private VectorExpression[] partitionExpressions;
+
+  private int[] keyInputColumnMap;
+  private int[] nonKeyInputColumnMap;
+
+  // The above members are initialized by the constructor and must not be
+  // transient.
+  //---------------------------------------------------------------------------
+
+  private transient boolean isLastGroupBatch;
+
+  private transient VectorizedRowBatch overflowBatch;
+
+  private transient VectorPTFGroupBatches groupBatches;
+
+  private transient VectorPTFEvaluatorBase[] evaluators;
+
+  private transient int[] streamingColumnMap;
+
+  private transient boolean allEvaluatorsAreStreaming;
+
+  private transient boolean isFirstPartition;
+
+  private transient boolean[] currentPartitionIsNull;
+  private transient long[] currentPartitionLongs;
+  private transient double[] currentPartitionDoubles;
+  private transient byte[][] currentPartitionByteArrays;
+  private transient int[] currentPartitionByteLengths;
+  private transient HiveDecimalWritable[] currentPartitionDecimals;
+  private transient Timestamp[] currentPartitionTimestamps;
+  private transient HiveIntervalDayTime[] currentPartitionIntervalDayTimes;
+
+  // For debug tracing: the name of the map or reduce task.
+  private transient String taskName;
+
+  // Debug display.
+  private transient long batchCounter;
+
+  //---------------------------------------------------------------------------
+
+  /** Kryo ctor. */
+  protected VectorPTFOperator() {
+    super();
+  }
+
+  public VectorPTFOperator(CompilationOpContext ctx) {
+    super(ctx);
+  }
+
+  public VectorPTFOperator(CompilationOpContext ctx,
+      VectorizationContext vContext, OperatorDesc conf) throws HiveException {
+    this(ctx);
+
+    LOG.info("VectorPTF constructor");
+
+    PTFDesc desc = (PTFDesc) conf;
+    this.conf = desc;
+    vectorDesc = (VectorPTFDesc) desc.getVectorDesc();
+    vectorPTFInfo = vectorDesc.getVectorPTFInfo();
+    this.vContext = vContext;
+
+    isPartitionOrderBy = vectorDesc.getIsPartitionOrderBy();
+
+    outputColumnMap = vectorPTFInfo.getOutputColumnMap();
+    outputColumnNames = vectorPTFInfo.getOutputColumnNames();
+    outputTypeInfos = vectorPTFInfo.getOutputTypeInfos();
+
+    /*
+     * Create a new vectorization context for the new projection.  The output column manager
+     * must be inherited from the incoming context so the scratch columns stay tracked.
+     */
+    vOutContext = new VectorizationContext(getName(), this.vContext);
+    setupVOutContext();
+
+    evaluatorFunctionNames = vectorDesc.getEvaluatorFunctionNames();
+    evaluatorCount = evaluatorFunctionNames.length;
+    evaluatorWindowFrameDefs = vectorDesc.getEvaluatorWindowFrameDefs();
+    evaluatorInputExprNodeDescs = vectorDesc.getEvaluatorInputExprNodeDescs();
+    evaluatorInputExpressions = vectorPTFInfo.getEvaluatorInputExpressions();
+    evaluatorInputColumnVectorTypes = vectorPTFInfo.getEvaluatorInputColumnVectorTypes();
+
+    orderExprNodeDescs = vectorDesc.getOrderExprNodeDescs();
+    orderColumnMap = vectorPTFInfo.getOrderColumnMap();
+    orderColumnVectorTypes = vectorPTFInfo.getOrderColumnVectorTypes();
+    orderExpressions = vectorPTFInfo.getOrderExpressions();
+
+    partitionExprNodeDescs = vectorDesc.getPartitionExprNodeDescs();
+    partitionColumnMap = vectorPTFInfo.getPartitionColumnMap();
+    partitionColumnVectorTypes = vectorPTFInfo.getPartitionColumnVectorTypes();
+    partitionExpressions = vectorPTFInfo.getPartitionExpressions();
+
+    keyInputColumnMap = vectorPTFInfo.getKeyInputColumnMap();
+    nonKeyInputColumnMap = vectorPTFInfo.getNonKeyInputColumnMap();
+  }
+
+  /**
+   * Setup the vectorized row batch description of the output of the native vectorized PTF
+   * operator.  Use the output projection we previously built from a mixture of input
+   * columns and new scratch columns.
+   */
+  protected void setupVOutContext() {
+    vOutContext.resetProjectionColumns();
+    final int count = outputColumnNames.length;
+    for (int i = 0; i < count; ++i) {
+      String columnName = outputColumnNames[i];
+      int outputColumn = outputColumnMap[i];
+      vOutContext.addProjectionColumn(columnName, outputColumn);
+    }
+  }
+
+  /*
+   * Allocate overflow batch columns by hand.
+   */
+  private void allocateOverflowBatchColumnVector(VectorizedRowBatch overflowBatch, int outputColumn,
+      String typeName) throws HiveException {
+
+    if (overflowBatch.cols[outputColumn] == null) {
+      typeName = VectorizationContext.mapTypeNameSynonyms(typeName);
+
+      TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(typeName);
+
+      overflowBatch.cols[outputColumn] = VectorizedBatchUtil.createColumnVector(typeInfo);
+    }
+  }
+
+  /*
+   * Setup our 2nd batch with the same "column schema" as the output columns plus any scratch
+   * columns, since the overflow batch will get forwarded to children operators.
+   */
+  protected VectorizedRowBatch setupOverflowBatch() throws HiveException {
+
+    int initialColumnCount = vContext.firstOutputColumnIndex();
+    VectorizedRowBatch overflowBatch;
+
+    int totalNumColumns = initialColumnCount + vOutContext.getScratchColumnTypeNames().length;
+    overflowBatch = new VectorizedRowBatch(totalNumColumns);
+
+    // First, allocate just the output columns we will be using.
+    for (int i = 0; i < outputColumnMap.length; i++) {
+      int outputColumn = outputColumnMap[i];
+      String typeName = outputTypeInfos[i].getTypeName();
+      allocateOverflowBatchColumnVector(overflowBatch, outputColumn, typeName);
+    }
+
+    // Now, add any scratch columns needed for children operators.
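+    //
+    // Illustrative layout (an assumption added for exposition, not from the patch): with
+    // firstOutputColumnIndex() == 3 and scratch column types ["double", "bigint"], the loop
+    // below allocates columns 3 and 4, so overflowBatch covers the input columns 0..2 plus
+    // the scratch columns, and its projection is set to the operator's outputColumnMap.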
+    int outputColumn = initialColumnCount;
+    for (String typeName : vOutContext.getScratchColumnTypeNames()) {
+      allocateOverflowBatchColumnVector(overflowBatch, outputColumn++, typeName);
+    }
+
+    overflowBatch.projectedColumns = outputColumnMap;
+    overflowBatch.projectionSize = outputColumnMap.length;
+
+    overflowBatch.reset();
+
+    return overflowBatch;
+  }
+
+  @Override
+  protected void initializeOp(Configuration hconf) throws HiveException {
+    super.initializeOp(hconf);
+
+    if (LOG.isDebugEnabled()) {
+      // Determine the name of our map or reduce task for debug tracing.
+      BaseWork work = Utilities.getMapWork(hconf);
+      if (work == null) {
+        work = Utilities.getReduceWork(hconf);
+      }
+      taskName = work.getName();
+    }
+
+    if (!isPartitionOrderBy) {
+      currentPartitionIsNull = null;
+      currentPartitionLongs = null;
+      currentPartitionDoubles = null;
+      currentPartitionByteArrays = null;
+      currentPartitionByteLengths = null;
+      currentPartitionDecimals = null;
+      currentPartitionTimestamps = null;
+      currentPartitionIntervalDayTimes = null;
+    } else {
+      final int partitionKeyCount = vectorDesc.getPartitionExprNodeDescs().length;
+      currentPartitionIsNull = new boolean[partitionKeyCount];
+      currentPartitionLongs = new long[partitionKeyCount];
+      currentPartitionDoubles = new double[partitionKeyCount];
+      currentPartitionByteArrays = new byte[partitionKeyCount][];
+      currentPartitionByteLengths = new int[partitionKeyCount];
+      currentPartitionDecimals = new HiveDecimalWritable[partitionKeyCount];
+      currentPartitionTimestamps = new Timestamp[partitionKeyCount];
+      currentPartitionIntervalDayTimes = new HiveIntervalDayTime[partitionKeyCount];
+    }
+
+    evaluators = VectorPTFDesc.getEvaluators(vectorDesc, vectorPTFInfo);
+
+    streamingColumnMap = VectorPTFDesc.getStreamingColumnMap(evaluators);
+
+    allEvaluatorsAreStreaming = (streamingColumnMap.length == evaluatorCount);
+
+    /*
+     * Setup the overflow batch.
+     */
+    overflowBatch = setupOverflowBatch();
+
+    groupBatches = new VectorPTFGroupBatches();
+    groupBatches.init(
+        evaluators, outputColumnMap, keyInputColumnMap, nonKeyInputColumnMap, streamingColumnMap, overflowBatch);
+
+    isFirstPartition = true;
+
+    batchCounter = 0;
+  }
+
+  @Override
+  public void setNextVectorBatchGroupStatus(boolean isLastGroupBatch) throws HiveException {
+    this.isLastGroupBatch = isLastGroupBatch;
+  }
+
+  /**
+   * We are processing a batch from the reduce processor that is only for one group key
+   * (i.e. one reduce-shuffle key).
+   *
+   * For a simple OVER (PARTITION BY column) or OVER (ORDER BY column), the reduce processor's
+   * group key is the partition or order by key.
+   *
+   * For an OVER (PARTITION BY column1 ORDER BY column2), the reduce-shuffle group key is
+   * the combination of the partition column1 and the order by column2.  In this case, this
+   * method has to watch for changes in the partition and reset the group aggregations.
+   *
+   * The reduce processor calls setNextVectorBatchGroupStatus beforehand to tell us whether the
+   * batch supplied to our process method is the last batch for the group key, or not.
+   */
+  @Override
+  public void process(Object row, int tag) throws HiveException {
+    VectorizedRowBatch batch = (VectorizedRowBatch) row;
+
+    for (VectorExpression orderExpression : orderExpressions) {
+      orderExpression.evaluate(batch);
+    }
+
+    if (partitionExpressions != null) {
+      for (VectorExpression partitionExpression : partitionExpressions) {
+        partitionExpression.evaluate(batch);
+      }
+    }
+
+    // Check for a partition key change when the group key also includes ORDER BY keys.
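+    //
+    // Example (an assumption added for exposition, not in the original patch): for
+    // OVER (PARTITION BY p_mfgr ORDER BY p_name), the reduce-shuffle key is (p_mfgr, p_name),
+    // so isLastGroupBatch marks order-key (peer group) boundaries.  Only a change in p_mfgr
+    // itself must reset the evaluators, and that is detected below by comparing this batch's
+    // partition columns against the remembered current partition values.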
+ if (isPartitionOrderBy) { + if (isFirstPartition) { + isFirstPartition = false; + setCurrentPartition(batch); + } else if (isPartitionChanged(batch)) { + setCurrentPartition(batch); + groupBatches.resetEvaluators(); + } + } + + if (allEvaluatorsAreStreaming) { + groupBatches.evaluateStreamingGroupBatch(batch); + forward(batch, null); + } else { + + /* + * Evaluate the aggregation functions over the group batch. + */ + groupBatches.evaluateGroupBatch(batch); + + // UNDONE: If not last batch, save contents of nonKeyInputColumnMap columns and do not + // UNDONE: evaluate the batch yet... + // UNDONE: We do need the overflowBatch when reading from storage. And, maybe simpler when + // UNDONE: processing the buffered batches. + if (!isLastGroupBatch) { + // The group spans a VectorizedRowBatch. Swap the relevant columns into our batch buffers, or + // write to temporary storage. + groupBatches.bufferGroupBatch(batch); + return; + } + + /* + * Take the group aggregation values and write output columns for all rows of every batch of + * the group. As each group batch is finished being written, it is forwarded to the next + * operator. + * + * Note the last batch is always our current input batch. + */ + groupBatches.fillGroupResultsAndForward(this, batch); + } + + if (!isPartitionOrderBy) { + groupBatches.resetEvaluators(); + } + } + + private boolean isPartitionChanged(VectorizedRowBatch batch) { + + final int count = partitionColumnMap.length; + for (int i = 0; i < count; i++) { + ColumnVector colVector = batch.cols[partitionColumnMap[i]]; + + // Partition columns are repeated -- so we test element 0. + + final boolean isNull = !colVector.noNulls && colVector.isNull[0]; + final boolean currentIsNull = currentPartitionIsNull[i]; + + if (isNull != currentIsNull) { + return true; + } + if (isNull) { + continue; + } + + switch (partitionColumnVectorTypes[i]) { + case LONG: + if (currentPartitionLongs[i] != ((LongColumnVector) colVector).vector[0]) { + return true; + } + break; + case DOUBLE: + if (currentPartitionDoubles[i] != ((DoubleColumnVector) colVector).vector[0]) { + return true; + } + break; + case BYTES: + { + BytesColumnVector byteColVector = (BytesColumnVector) colVector; + byte[] bytes = byteColVector.vector[0]; + final int start = byteColVector.start[0]; + final int length = byteColVector.length[0]; + if (!StringExpr.equal( + bytes, start, length, + currentPartitionByteArrays[i], 0, currentPartitionByteLengths[i])) { + return true; + } + } + break; + case DECIMAL: + if (!currentPartitionDecimals[i].equals(((DecimalColumnVector) colVector).vector[0])) { + return true; + } + break; + case TIMESTAMP: + if (((TimestampColumnVector) colVector).compareTo(0, currentPartitionTimestamps[i]) != 0) { + return true; + } + break; + case INTERVAL_DAY_TIME: + if (((IntervalDayTimeColumnVector) colVector).compareTo(0, currentPartitionIntervalDayTimes[i]) != 0) { + return true; + } + break; + default: + throw new RuntimeException("Unexpected column vector type " + partitionColumnVectorTypes[i]); + } + } + return false; + } + + private void setCurrentPartition(VectorizedRowBatch batch) { + + final int count = partitionColumnMap.length; + for (int i = 0; i < count; i++) { + ColumnVector colVector = batch.cols[partitionColumnMap[i]]; + + // Partition columns are repeated -- so we test element 0. 
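+      //
+      // Illustration (an assumption added for exposition): within one reduce-shuffle batch
+      // the partition key columns are constant -- e.g. every row carries the repeated value
+      // p_mfgr = "Manufacturer#1" -- so remembering element 0 of each key column is enough
+      // to compare later batches against the current partition.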
+ + final boolean isNull = !colVector.noNulls && colVector.isNull[0]; + currentPartitionIsNull[i] = isNull; + + if (isNull) { + continue; + } + + switch (partitionColumnVectorTypes[i]) { + case LONG: + currentPartitionLongs[i] = ((LongColumnVector) colVector).vector[0]; + break; + case DOUBLE: + currentPartitionDoubles[i] = ((DoubleColumnVector) colVector).vector[0]; + break; + case BYTES: + { + BytesColumnVector byteColVector = (BytesColumnVector) colVector; + byte[] bytes = byteColVector.vector[0]; + final int start = byteColVector.start[0]; + final int length = byteColVector.length[0]; + if (currentPartitionByteArrays[i] == null || currentPartitionByteLengths[i] < length) { + currentPartitionByteArrays[i] = Arrays.copyOfRange(bytes, start, start + length); + } else { + System.arraycopy(bytes, start, currentPartitionByteArrays[i], 0, length); + } + currentPartitionByteLengths[i] = length; + } + break; + case DECIMAL: + if (currentPartitionDecimals[i] == null) { + currentPartitionDecimals[i] = new HiveDecimalWritable(); + } + currentPartitionDecimals[i].set(((DecimalColumnVector) colVector).vector[0]); + break; + case TIMESTAMP: + if (currentPartitionTimestamps[i] == null) { + currentPartitionTimestamps[i] = new Timestamp(0); + } + ((TimestampColumnVector) colVector).timestampUpdate(currentPartitionTimestamps[i], 0); + break; + case INTERVAL_DAY_TIME: + if (currentPartitionIntervalDayTimes[i] == null) { + currentPartitionIntervalDayTimes[i] = new HiveIntervalDayTime(); + } + ((IntervalDayTimeColumnVector) colVector).intervalDayTimeUpdate(currentPartitionIntervalDayTimes[i], 0); + break; + default: + throw new RuntimeException("Unexpected column vector type " + partitionColumnVectorTypes[i]); + } + } + } + + @Override + public void forward(Object row, ObjectInspector rowInspector) throws HiveException { + super.forward(row, rowInspector); + } + + @Override + protected void closeOp(boolean abort) throws HiveException { + super.closeOp(abort); + + // We do not try to finish and flush an in-progress group because correct values require the + // last group batch. 
+ } + + /** + * @return the name of the operator + */ + @Override + public String getName() { + return getOperatorName(); + } + + static public String getOperatorName() { + return "PTF"; + } + + @Override + public OperatorType getType() { + return OperatorType.PTF; + } + + @Override + public VectorizationContext getOuputVectorizationContext() { + return vOutContext; + } +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java index f0df2e9..6a8a51b 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java @@ -61,12 +61,21 @@ import org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinOuterLongOperator; import org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinOuterMultiKeyOperator; import org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinOuterStringOperator; +import org.apache.hadoop.hive.ql.exec.vector.ptf.VectorPTFEvaluatorBase; +import org.apache.hadoop.hive.ql.exec.vector.ptf.VectorPTFEvaluatorDenseRank; +import org.apache.hadoop.hive.ql.exec.vector.ptf.VectorPTFEvaluatorDoubleAvg; +import org.apache.hadoop.hive.ql.exec.vector.ptf.VectorPTFEvaluatorDoubleMax; +import org.apache.hadoop.hive.ql.exec.vector.ptf.VectorPTFEvaluatorDoubleMin; +import org.apache.hadoop.hive.ql.exec.vector.ptf.VectorPTFEvaluatorDoubleSum; +import org.apache.hadoop.hive.ql.exec.vector.ptf.VectorPTFEvaluatorRank; +import org.apache.hadoop.hive.ql.exec.vector.ptf.VectorPTFOperator; import org.apache.hadoop.hive.ql.exec.vector.reducesink.VectorReduceSinkLongOperator; import org.apache.hadoop.hive.ql.exec.vector.reducesink.VectorReduceSinkMultiKeyOperator; import org.apache.hadoop.hive.ql.exec.vector.reducesink.VectorReduceSinkObjectHashOperator; import org.apache.hadoop.hive.ql.exec.vector.reducesink.VectorReduceSinkStringOperator; import org.apache.hadoop.hive.ql.exec.vector.udf.VectorUDFAdaptor; import org.apache.hadoop.hive.ql.exec.vector.ColumnVector.Type; +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; import org.apache.hadoop.hive.ql.exec.vector.VectorColumnOutputMapping; import org.apache.hadoop.hive.ql.exec.vector.VectorColumnSourceMapping; import org.apache.hadoop.hive.ql.exec.vector.VectorMapJoinOperator; @@ -95,6 +104,7 @@ import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.metadata.VirtualColumn; import org.apache.hadoop.hive.ql.parse.SemanticException; +import org.apache.hadoop.hive.ql.parse.WindowingSpec.WindowType; import org.apache.hadoop.hive.ql.plan.AbstractOperatorDesc; import org.apache.hadoop.hive.ql.plan.AggregationDesc; import org.apache.hadoop.hive.ql.plan.AppMasterEventDesc; @@ -102,6 +112,7 @@ import org.apache.hadoop.hive.ql.plan.Explain; import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeDesc.ExprNodeDescEqualityWrapper; import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; import org.apache.hadoop.hive.ql.plan.FileSinkDesc; import org.apache.hadoop.hive.ql.plan.FilterDesc; @@ -112,10 +123,14 @@ import org.apache.hadoop.hive.ql.plan.MapWork; import org.apache.hadoop.hive.ql.plan.MapredWork; import org.apache.hadoop.hive.ql.plan.OperatorDesc; +import org.apache.hadoop.hive.ql.plan.PTFDesc; import org.apache.hadoop.hive.ql.plan.SelectDesc; import 
org.apache.hadoop.hive.ql.plan.VectorAppMasterEventDesc;
 import org.apache.hadoop.hive.ql.plan.VectorFileSinkDesc;
 import org.apache.hadoop.hive.ql.plan.VectorFilterDesc;
+import org.apache.hadoop.hive.ql.plan.VectorPTFDesc;
+import org.apache.hadoop.hive.ql.plan.VectorPTFInfo;
+import org.apache.hadoop.hive.ql.plan.VectorPTFDesc.SupportedFunctionType;
 import org.apache.hadoop.hive.ql.plan.VectorTableScanDesc;
 import org.apache.hadoop.hive.ql.plan.VectorizationCondition;
 import org.apache.hadoop.hive.ql.plan.VectorGroupByDesc.ProcessingMode;
@@ -149,6 +164,13 @@ import org.apache.hadoop.hive.ql.plan.VectorReduceSinkInfo;
 import org.apache.hadoop.hive.ql.plan.VectorPartitionDesc;
 import org.apache.hadoop.hive.ql.plan.api.OperatorType;
+import org.apache.hadoop.hive.ql.plan.ptf.OrderExpressionDef;
+import org.apache.hadoop.hive.ql.plan.ptf.PTFExpressionDef;
+import org.apache.hadoop.hive.ql.plan.ptf.PartitionDef;
+import org.apache.hadoop.hive.ql.plan.ptf.PartitionedTableFunctionDef;
+import org.apache.hadoop.hive.ql.plan.ptf.WindowFrameDef;
+import org.apache.hadoop.hive.ql.plan.ptf.WindowFunctionDef;
+import org.apache.hadoop.hive.ql.plan.ptf.WindowTableFunctionDef;
 import org.apache.hadoop.hive.ql.udf.UDFAcos;
 import org.apache.hadoop.hive.ql.udf.UDFAsin;
 import org.apache.hadoop.hive.ql.udf.UDFAtan;
@@ -190,6 +212,8 @@ import org.apache.hadoop.hive.ql.udf.UDFWeekOfYear;
 import org.apache.hadoop.hive.ql.udf.UDFYear;
 import org.apache.hadoop.hive.ql.udf.generic.*;
+import org.apache.hadoop.hive.ql.udf.ptf.TableFunctionEvaluator;
+import org.apache.hadoop.hive.ql.udf.ptf.WindowingTableFunction;
 import org.apache.hadoop.hive.serde.serdeConstants;
 import org.apache.hadoop.hive.serde2.Deserializer;
 import org.apache.hadoop.hive.serde2.NullStructSerDe;
@@ -1659,7 +1683,7 @@ private ValidatorVectorizationContext(HiveConf hiveConf) {
     }
 
     @Override
-    protected int getInputColumnIndex(String name) {
+    public int getInputColumnIndex(String name) {
       return 0;
     }
@@ -1818,6 +1842,9 @@ boolean validateReduceWorkOperator(Operator<? extends OperatorDesc> op) {
       ret = op instanceof SparkHashTableSinkOperator &&
          validateSparkHashTableSinkOperator((SparkHashTableSinkOperator) op);
       break;
+    case PTF:
+      ret = validatePTFOperator((PTFOperator) op);
+      break;
     default:
       setOperatorNotSupported(op);
       ret = false;
@@ -2073,6 +2100,112 @@ private boolean validateFileSinkOperator(FileSinkOperator op) {
     return true;
   }
 
+  private boolean containsLeadLag(ExprNodeDesc exprNodeDesc) {
+    if (exprNodeDesc instanceof ExprNodeGenericFuncDesc) {
+      ExprNodeGenericFuncDesc genericFuncDesc = (ExprNodeGenericFuncDesc) exprNodeDesc;
+      GenericUDF genFuncClass = genericFuncDesc.getGenericUDF();
+      if (genFuncClass instanceof GenericUDFLag ||
+          genFuncClass instanceof GenericUDFLead) {
+        return true;
+      }
+      return containsLeadLag(genericFuncDesc.getChildren());
+    } else if (exprNodeDesc instanceof ExprNodeColumnDesc) {
+      return false;
+    } else {
+      // UNDONE: Other ExprNode* variations?
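+      // Note (illustrative, an assumption added for exposition): the recursive walk above is
+      // what rejects an aggregation argument such as sum(lag(p_retailprice, 1)) -- the
+      // embedded GenericUDFLag is found inside the expression tree, and lead/lag need access
+      // to other rows' values, which the vectorized evaluators do not support.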
+ return false; + } + } + + private boolean containsLeadLag(List exprNodeDescList) { + for (ExprNodeDesc exprNodeDesc : exprNodeDescList) { + if (containsLeadLag(exprNodeDesc)) { + return true; + } + } + return false; + } + + private boolean validatePTFOperator(PTFOperator op) { + PTFDesc ptfDesc = (PTFDesc) op.getConf(); + boolean isMapSide = ptfDesc.isMapSide(); + if (isMapSide) { + setOperatorIssue("PTF Mapper not supported"); + return false; + } + boolean forNoop = ptfDesc.forNoop(); + if (forNoop) { + setOperatorIssue("NOOP not supported"); + return false; + } + boolean forWindowing = ptfDesc.forWindowing(); + if (!forWindowing) { + setOperatorIssue("Windowing required"); + return false; + } + PartitionedTableFunctionDef funcDef = ptfDesc.getFuncDef(); + boolean isWindowTableFunctionDef = (funcDef instanceof WindowTableFunctionDef); + if (isWindowTableFunctionDef) { + + // As a validator, when we pass null for vContext, the output column info will return null. + VectorPTFDesc vectorPTFDesc = null; + try { + vectorPTFDesc = createVectorPTFDesc(op, ptfDesc); + } catch (HiveException e) { + setOperatorIssue("exception: " + VectorizationContext.getStackTraceAsSingleLine(e)); + return false; + } + ptfDesc.setVectorDesc(vectorPTFDesc); + + // UNDONE: Validate outputExprNodeColumns + + boolean isPartitionOrderBy = vectorPTFDesc.getIsPartitionOrderBy(); + String[] evaluatorFunctionNames = vectorPTFDesc.getEvaluatorFunctionNames(); + final int count = evaluatorFunctionNames.length; + WindowFrameDef[] evaluatorWindowFrameDefs = vectorPTFDesc.getEvaluatorWindowFrameDefs(); + ExprNodeDesc[] evaluatorInputExprNodeDescs = vectorPTFDesc.getEvaluatorInputExprNodeDescs(); + + for (int i = 0; i < count; i++) { + String functionName = evaluatorFunctionNames[i]; + SupportedFunctionType supportedFunctionType = VectorPTFDesc.supportedFunctionsMap.get(functionName); + if (supportedFunctionType == null) { + setOperatorIssue(functionName + " not in supported functions " + VectorPTFDesc.supportedFunctionNames); + return false; + } + WindowFrameDef windowFrameDef = evaluatorWindowFrameDefs[i]; + if (!windowFrameDef.isStartUnbounded()) { + setOperatorIssue(functionName + " only UNBOUNDED start frame is supported"); + return false; + } + switch (windowFrameDef.getWindowType()) { + case RANGE: + if (!windowFrameDef.getEnd().isCurrentRow()) { + setOperatorIssue(functionName + " only CURRENT ROW end frame is supported for RANGE"); + return false; + } + break; + case ROWS: + if (!windowFrameDef.isEndUnbounded()) { + setOperatorIssue(functionName + " UNBOUNDED end frame is not supported for ROWS window type"); + return false; + } + break; + default: + throw new RuntimeException("Unexpected window type " + windowFrameDef.getWindowType()); + } + ExprNodeDesc exprNodeDesc = evaluatorInputExprNodeDescs[i]; + if (containsLeadLag(exprNodeDesc)) { + setOperatorIssue("lead and lag function not supported in argument expression of aggregation function " + functionName); + return false; + } + } + return true; + } + setOperatorIssue("Not quite ready yet validatePTFOperator"); + return false; + } + private boolean validateExprNodeDesc(List descs, String expressionTitle) { return validateExprNodeDesc(descs, expressionTitle, VectorExpressionDescriptor.Mode.PROJECTION); } @@ -3334,6 +3467,309 @@ private boolean usesVectorUDFAdaptor(VectorExpression[] vecExprs) { selectOp.getCompilationOpContext(), selectDesc, vContext, selectOp); } + private static void fillInPTFEvaluators( + List windowsFunctions, + String[] evaluatorFunctionNames, 
+      WindowFrameDef[] evaluatorWindowFrameDefs,
+      ExprNodeDesc[] evaluatorInputExprNodeDescs) throws HiveException {
+    final int functionCount = windowsFunctions.size();
+    for (int i = 0; i < functionCount; i++) {
+      WindowFunctionDef winFunc = windowsFunctions.get(i);
+      evaluatorFunctionNames[i] = winFunc.getName();
+      evaluatorWindowFrameDefs[i] = winFunc.getWindowFrame();
+
+      List<PTFExpressionDef> args = winFunc.getArgs();
+      if (args != null) {
+
+        // UNDONE: Just one argument?
+        PTFExpressionDef arg = args.get(0);
+
+        evaluatorInputExprNodeDescs[i] = arg.getExprNode();
+      }
+    }
+  }
+
+  private static ExprNodeDesc[] getPartitionExprNodeDescs(List<PTFExpressionDef> partitionExpressions) {
+    final int size = partitionExpressions.size();
+    ExprNodeDesc[] exprNodeDescs = new ExprNodeDesc[size];
+    for (int i = 0; i < size; i++) {
+      exprNodeDescs[i] = partitionExpressions.get(i).getExprNode();
+    }
+    return exprNodeDescs;
+  }
+
+  private static ExprNodeDesc[] getOrderExprNodeDescs(List<OrderExpressionDef> orderExpressions) {
+    final int size = orderExpressions.size();
+    ExprNodeDesc[] exprNodeDescs = new ExprNodeDesc[size];
+    for (int i = 0; i < size; i++) {
+      exprNodeDescs[i] = orderExpressions.get(i).getExprNode();
+    }
+    return exprNodeDescs;
+  }
+
+  private static VectorPTFDesc createVectorPTFDesc(Operator<? extends OperatorDesc> ptfOp,
+      PTFDesc ptfDesc) throws HiveException {
+
+    // IMPORTANT NOTE: partitionExprNodeDescs and orderExprNodeDescs are MAP-SIDE expressions.
+    // On the REDUCER-SIDE, we just have columns (ExprNodeColumnDesc); i.e. we assume that for
+    // PTF no REDUCER-SIDE constants are inferred at planning time.
+
+    PartitionedTableFunctionDef funcDef = ptfDesc.getFuncDef();
+
+    ArrayList<ColumnInfo> outputSignature = ptfOp.getSchema().getSignature();
+    final int outputSize = outputSignature.size();
+
+    List<PTFExpressionDef> partitionExpressions = funcDef.getPartition().getExpressions();
+    final int partitionKeyCount = partitionExpressions.size();
+    ExprNodeDesc[] partitionExprNodeDescs = getPartitionExprNodeDescs(partitionExpressions);
+
+    List<OrderExpressionDef> orderExpressions = funcDef.getOrder().getExpressions();
+    final int orderKeyCount = orderExpressions.size();
+    ExprNodeDesc[] orderExprNodeDescs = getOrderExprNodeDescs(orderExpressions);
+
+    // When there are both PARTITION BY and ORDER BY clauses, they may have different
+    // expressions; otherwise, there are only ORDER BY expressions.
+    boolean isPartitionOrderBy = false;
+
+    if (partitionKeyCount != orderKeyCount) {
+      // Obviously different expressions.
+      isPartitionOrderBy = true;
+    } else {
+      // Check each ExprNodeDesc.
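+      //
+      // Example (an assumption added for exposition): OVER (PARTITION BY p_mfgr ORDER BY
+      // p_mfgr) compares equal below, so isPartitionOrderBy stays false, whereas
+      // OVER (PARTITION BY p_mfgr ORDER BY p_name) has equal key counts but different
+      // expressions, so isPartitionOrderBy becomes true.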
+ for (int i = 0; i < partitionKeyCount; i++) { + final ExprNodeDescEqualityWrapper partitionExprEqualityWrapper = + new ExprNodeDesc.ExprNodeDescEqualityWrapper(partitionExprNodeDescs[i]); + final ExprNodeDescEqualityWrapper orderExprEqualityWrapper = + new ExprNodeDesc.ExprNodeDescEqualityWrapper(orderExprNodeDescs[i]); + if (!partitionExprEqualityWrapper.equals(orderExprEqualityWrapper)) { + isPartitionOrderBy = true; + break; + } + } + } + + WindowTableFunctionDef windowTableFunctionDef = (WindowTableFunctionDef) funcDef; + List windowsFunctions = windowTableFunctionDef.getWindowFunctions(); + final int functionCount = windowsFunctions.size(); + + String[] evaluatorFunctionNames = new String[functionCount]; + WindowFrameDef[] evaluatorWindowFrameDefs = new WindowFrameDef[functionCount]; + ExprNodeDesc[] evaluatorInputExprNodeDescs = new ExprNodeDesc[functionCount]; + + fillInPTFEvaluators( + windowsFunctions, + evaluatorFunctionNames, + evaluatorWindowFrameDefs, + evaluatorInputExprNodeDescs); + + VectorPTFDesc vectorPTFDesc = new VectorPTFDesc(); + + vectorPTFDesc.setIsPartitionOrderBy(isPartitionOrderBy); + + vectorPTFDesc.setOrderExprNodeDescs(orderExprNodeDescs); + vectorPTFDesc.setPartitionExprNodeDescs(partitionExprNodeDescs); + + vectorPTFDesc.setEvaluatorFunctionNames(evaluatorFunctionNames); + vectorPTFDesc.setEvaluatorWindowFrameDefs(evaluatorWindowFrameDefs); + vectorPTFDesc.setEvaluatorInputExprNodeDescs(evaluatorInputExprNodeDescs); + + return vectorPTFDesc; + } + + private static void determineKeyAndNonKeyInputColumnMap(int[] outputColumnMap, + boolean isPartitionOrderBy, int[] orderColumnMap, int[] partitionColumnMap, + int evaluatorCount, ArrayList keyInputColumns, + ArrayList nonKeyInputColumns) { + + final int outputSize = outputColumnMap.length; + final int orderKeyCount = orderColumnMap.length; + final int partitionKeyCount = (isPartitionOrderBy ? partitionColumnMap.length : 0); + for (int i = evaluatorCount; i < outputSize; i++) { + final int nonEvalColumnNum = outputColumnMap[i]; + boolean isKey = false; + for (int o = 0; o < orderKeyCount; o++) { + if (nonEvalColumnNum == orderColumnMap[o]) { + isKey = true; + break; + } + } + if (!isKey && isPartitionOrderBy) { + for (int p = 0; p < partitionKeyCount; p++) { + if (nonEvalColumnNum == partitionColumnMap[p]) { + isKey = true; + break; + } + } + } + if (isKey) { + keyInputColumns.add(nonEvalColumnNum); + } else { + nonKeyInputColumns.add(nonEvalColumnNum); + } + } + } + + private static VectorPTFInfo createVectorPTFInfo(Operator ptfOp, + PTFDesc ptfDesc, VectorizationContext vContext) throws HiveException { + + PartitionedTableFunctionDef funcDef = ptfDesc.getFuncDef(); + + ArrayList outputSignature = ptfOp.getSchema().getSignature(); + final int outputSize = outputSignature.size(); + + VectorPTFDesc vectorPTFDesc = (VectorPTFDesc) ptfDesc.getVectorDesc(); + + boolean isPartitionOrderBy = vectorPTFDesc.getIsPartitionOrderBy(); + ExprNodeDesc[] orderExprNodeDescs = vectorPTFDesc.getOrderExprNodeDescs(); + ExprNodeDesc[] partitionExprNodeDescs = vectorPTFDesc.getPartitionExprNodeDescs(); + String[] evaluatorFunctionNames = vectorPTFDesc.getEvaluatorFunctionNames(); + + final int evaluatorCount = evaluatorFunctionNames.length; + WindowFrameDef[] evaluatorWindowFrameDefs = vectorPTFDesc.getEvaluatorWindowFrameDefs(); + ExprNodeDesc[] evaluatorInputExprNodeDescs = vectorPTFDesc.getEvaluatorInputExprNodeDescs(); + + /* + * Output columns. 
+ */ + int[] outputColumnMap = new int[outputSize]; + String[] outputColumnNames = new String[outputSize]; + TypeInfo[] outputTypeInfos = new TypeInfo[outputSize]; + for (int i = 0; i < evaluatorCount; i++) { + ColumnInfo colInfo = outputSignature.get(i); + TypeInfo typeInfo = colInfo.getType(); + final int outputColumnNum; + outputColumnNum = vContext.allocateScratchColumn(typeInfo); + outputColumnMap[i] = outputColumnNum; + outputColumnNames[i] = colInfo.getInternalName(); + outputTypeInfos[i] = typeInfo; + } + for (int i = evaluatorCount; i < outputSize; i++) { + ColumnInfo colInfo = outputSignature.get(i); + outputColumnMap[i] = vContext.getInputColumnIndex(colInfo.getInternalName()); + outputColumnNames[i] = colInfo.getInternalName(); + outputTypeInfos[i] = colInfo.getType(); + } + + /* + * Partition and order by. + */ + + int[] partitionColumnMap; + Type[] partitionColumnVectorTypes; + VectorExpression[] partitionExpressions; + + if (!isPartitionOrderBy) { + partitionColumnMap = null; + partitionColumnVectorTypes = null; + partitionExpressions = null; + } else { + final int partitionKeyCount = partitionExprNodeDescs.length; + partitionColumnMap = new int[partitionKeyCount]; + partitionColumnVectorTypes = new Type[partitionKeyCount]; + partitionExpressions = new VectorExpression[partitionKeyCount]; + + for (int i = 0; i < partitionKeyCount; i++) { + VectorExpression partitionExpression = vContext.getVectorExpression(partitionExprNodeDescs[i]); + String typeName = partitionExpression.getOutputType(); + typeName = VectorizationContext.mapTypeNameSynonyms(typeName); + TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(typeName); + Type columnVectorType = VectorizationContext.getColumnVectorTypeFromTypeInfo(typeInfo); + partitionColumnVectorTypes[i] = columnVectorType; + partitionColumnMap[i] = partitionExpression.getOutputColumn(); + partitionExpressions[i] = partitionExpression; + } + } + + final int orderKeyCount = orderExprNodeDescs.length; + int[] orderColumnMap = new int[orderKeyCount]; + Type[] orderColumnVectorTypes = new Type[orderKeyCount]; + VectorExpression[] orderExpressions = new VectorExpression[orderKeyCount]; + for (int i = 0; i < orderKeyCount; i++) { + VectorExpression orderExpression = vContext.getVectorExpression(orderExprNodeDescs[i]); + String typeName = orderExpression.getOutputType(); + typeName = VectorizationContext.mapTypeNameSynonyms(typeName); + TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(typeName); + Type columnVectorType = VectorizationContext.getColumnVectorTypeFromTypeInfo(typeInfo); + orderColumnVectorTypes[i] = columnVectorType; + orderColumnMap[i] = orderExpression.getOutputColumn(); + orderExpressions[i] = orderExpression; + } + + ArrayList keyInputColumns = new ArrayList(); + ArrayList nonKeyInputColumns = new ArrayList(); + determineKeyAndNonKeyInputColumnMap(outputColumnMap, isPartitionOrderBy, orderColumnMap, + partitionColumnMap, evaluatorCount, keyInputColumns, nonKeyInputColumns); + int[] keyInputColumnMap = ArrayUtils.toPrimitive(keyInputColumns.toArray(new Integer[0])); + int[] nonKeyInputColumnMap = ArrayUtils.toPrimitive(nonKeyInputColumns.toArray(new Integer[0])); + + VectorExpression[] evaluatorInputExpressions = new VectorExpression[evaluatorCount]; + Type[] evaluatorInputColumnVectorTypes = new Type[evaluatorCount]; + for (int i = 0; i < evaluatorCount; i++) { + String functionName = evaluatorFunctionNames[i]; + WindowFrameDef windowFrameDef = evaluatorWindowFrameDefs[i]; + SupportedFunctionType functionType = 
VectorPTFDesc.supportedFunctionsMap.get(functionName); + ExprNodeDesc exprNodeDesc = evaluatorInputExprNodeDescs[i]; + VectorExpression inputVectorExpression; + final Type columnVectorType; + if (exprNodeDesc != null) { + + // Determine input vector expression using the VectorizationContext. + inputVectorExpression = vContext.getVectorExpression(exprNodeDesc); + + TypeInfo typeInfo = exprNodeDesc.getTypeInfo(); + PrimitiveCategory primitiveCategory = ((PrimitiveTypeInfo) typeInfo).getPrimitiveCategory(); + columnVectorType = VectorizationContext.getColumnVectorTypeFromTypeInfo(typeInfo); + } else { + inputVectorExpression = null; + columnVectorType = ColumnVector.Type.NONE; + } + + evaluatorInputExpressions[i] = inputVectorExpression; + evaluatorInputColumnVectorTypes[i] = columnVectorType; + } + + VectorPTFInfo vectorPTFInfo = new VectorPTFInfo(); + + vectorPTFInfo.setOutputColumnMap(outputColumnMap); + vectorPTFInfo.setOutputColumnNames(outputColumnNames); + vectorPTFInfo.setOutputTypeInfos(outputTypeInfos); + + vectorPTFInfo.setPartitionColumnMap(partitionColumnMap); + vectorPTFInfo.setPartitionColumnVectorTypes(partitionColumnVectorTypes); + vectorPTFInfo.setPartitionExpressions(partitionExpressions); + + vectorPTFInfo.setOrderColumnMap(orderColumnMap); + vectorPTFInfo.setOrderColumnVectorTypes(orderColumnVectorTypes); + vectorPTFInfo.setOrderExpressions(orderExpressions); + + vectorPTFInfo.setEvaluatorInputExpressions(evaluatorInputExpressions); + vectorPTFInfo.setEvaluatorInputColumnVectorTypes(evaluatorInputColumnVectorTypes); + + vectorPTFInfo.setKeyInputColumnMap(keyInputColumnMap); + vectorPTFInfo.setNonKeyInputColumnMap(nonKeyInputColumnMap); + + return vectorPTFInfo; + } + + /* + * NOTE: The VectorPTFDesc has already been allocated and partially populated. 
+   */
+  public static Operator<? extends OperatorDesc> vectorizePTFOperator(
+      Operator<? extends OperatorDesc> ptfOp, VectorizationContext vContext)
+          throws HiveException {
+    PTFDesc ptfDesc = (PTFDesc) ptfOp.getConf();
+
+    VectorPTFDesc vectorPTFDesc = (VectorPTFDesc) ptfDesc.getVectorDesc();
+
+    VectorPTFInfo vectorPTFInfo = createVectorPTFInfo(ptfOp, ptfDesc, vContext);
+
+    vectorPTFDesc.setVectorPTFInfo(vectorPTFInfo);
+
+    Class<? extends Operator<?>> opClass = VectorPTFOperator.class;
+    return OperatorFactory.getVectorOperator(
+        ptfOp.getCompilationOpContext(), ptfDesc, vContext, ptfOp);
+  }
+
 public Operator<? extends OperatorDesc> vectorizeOperator(Operator<? extends OperatorDesc> op,
     VectorizationContext vContext, boolean isTezOrSpark,
     VectorTaskColumnInfo vectorTaskColumnInfo) throws HiveException {
@@ -3508,6 +3944,10 @@ private boolean usesVectorUDFAdaptor(VectorExpression[] vecExprs) {
           isNative = true;
         }
         break;
+      case PTF:
+        vectorOp = vectorizePTFOperator(op, vContext);
+        isNative = true;
+        break;
       case HASHTABLESINK:
         {
           SparkHashTableSinkDesc sparkHashTableSinkDesc =
              (SparkHashTableSinkDesc) op.getConf();
diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/PTFDesc.java ql/src/java/org/apache/hadoop/hive/ql/plan/PTFDesc.java
index c4b49b6..229e018 100644
--- ql/src/java/org/apache/hadoop/hive/ql/plan/PTFDesc.java
+++ ql/src/java/org/apache/hadoop/hive/ql/plan/PTFDesc.java
@@ -20,16 +20,28 @@
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.Collections;
 import java.util.List;
 
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext;
+import org.apache.hadoop.hive.ql.exec.vector.ColumnVector.Type;
+import org.apache.hadoop.hive.ql.exec.vector.expressions.IdentityExpression;
+import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression;
+import org.apache.hadoop.hive.ql.exec.vector.ptf.VectorPTFEvaluatorBase;
 import org.apache.hadoop.hive.ql.parse.LeadLagInfo;
 import org.apache.hadoop.hive.ql.plan.Explain.Level;
+import org.apache.hadoop.hive.ql.plan.Explain.Vectorization;
+import org.apache.hadoop.hive.ql.plan.VectorPTFDesc.SupportedFunctionType;
 import org.apache.hadoop.hive.ql.plan.ptf.PTFInputDef;
 import org.apache.hadoop.hive.ql.plan.ptf.PartitionedTableFunctionDef;
+import org.apache.hadoop.hive.ql.plan.ptf.WindowFrameDef;
 import org.apache.hadoop.hive.ql.plan.ptf.WindowTableFunctionDef;
 import org.apache.hadoop.hive.ql.udf.ptf.Noop;
+import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
+import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -114,4 +126,101 @@ public Configuration getCfg() {
   public void setCfg(Configuration cfg) {
     this.cfg = cfg;
   }
+
+  public class PTFOperatorExplainVectorization extends OperatorExplainVectorization {
+
+    private final PTFDesc PTFDesc;
+    private final VectorPTFDesc vectorPTFDesc;
+    private final VectorPTFInfo vectorPTFInfo;
+
+    private VectorizationCondition[] nativeConditions;
+
+    public PTFOperatorExplainVectorization(PTFDesc PTFDesc, VectorDesc vectorDesc) {
+      // VectorPTFOperator is native vectorized.
+ super(vectorDesc, true); + this.PTFDesc = PTFDesc; + vectorPTFDesc = (VectorPTFDesc) vectorDesc; + vectorPTFInfo = vectorPTFDesc.getVectorPTFInfo(); + } + + @Explain(vectorization = Vectorization.EXPRESSION, displayName = "functionNames", explainLevels = { Level.DEFAULT, Level.EXTENDED }) + public String getFunctionNames() { + return Arrays.toString(vectorPTFDesc.getEvaluatorFunctionNames()); + } + + @Explain(vectorization = Vectorization.EXPRESSION, displayName = "functionInputExpressions", explainLevels = { Level.DEFAULT, Level.EXTENDED }) + public String getFunctionInputExpressions() { + return Arrays.toString(vectorPTFInfo.getEvaluatorInputExpressions()); + } + + @Explain(vectorization = Vectorization.EXPRESSION, displayName = "partitionExpressions", explainLevels = { Level.DEFAULT, Level.EXTENDED }) + public String getPartitionExpressions() { + VectorExpression[] partitionExpressions = vectorPTFInfo.getPartitionExpressions(); + if (partitionExpressions == null) { + return null; + } + return Arrays.toString(partitionExpressions); + } + + @Explain(vectorization = Vectorization.EXPRESSION, displayName = "orderExpressions", explainLevels = { Level.DEFAULT, Level.EXTENDED }) + public String getOrderExpressions() { + VectorExpression[] orderExpressions = vectorPTFInfo.getOrderExpressions(); + if (orderExpressions == null) { + return null; + } + return Arrays.toString(orderExpressions); + } + + @Explain(vectorization = Vectorization.EXPRESSION, displayName = "evaluatorClasses", explainLevels = { Level.DEFAULT, Level.EXTENDED }) + public String getEvaluatorClasses() { + + VectorPTFEvaluatorBase[] evaluators = VectorPTFDesc.getEvaluators(vectorPTFDesc, vectorPTFInfo); + + ArrayList result = new ArrayList(evaluators.length); + for (VectorPTFEvaluatorBase evaluator : evaluators) { + result.add(evaluator.getClass().getSimpleName()); + } + return result.toString(); + } + + @Explain(vectorization = Vectorization.DETAIL, displayName = "outputColumns", explainLevels = { Level.DEFAULT, Level.EXTENDED }) + public String getOutputColumns() { + return Arrays.toString(vectorPTFInfo.getOutputColumnMap()); + } + + @Explain(vectorization = Vectorization.DETAIL, displayName = "outputTypes", explainLevels = { Level.DEFAULT, Level.EXTENDED }) + public String getOutputTypes() { + return Arrays.toString(vectorPTFInfo.getOutputTypeInfos()); + } + + @Explain(vectorization = Vectorization.DETAIL, displayName = "keyInputColumns", explainLevels = { Level.DEFAULT, Level.EXTENDED }) + public String getKeyInputColumns() { + return Arrays.toString(vectorPTFInfo.getKeyInputColumnMap()); + } + + @Explain(vectorization = Vectorization.DETAIL, displayName = "nonKeyInputColumns", explainLevels = { Level.DEFAULT, Level.EXTENDED }) + public String getNonKeyInputColumns() { + return Arrays.toString(vectorPTFInfo.getNonKeyInputColumnMap()); + } + + @Explain(vectorization = Vectorization.DETAIL, displayName = "streamingColumns", explainLevels = { Level.DEFAULT, Level.EXTENDED }) + public String getStreamingColumns() { + VectorPTFEvaluatorBase[] evaluators = VectorPTFDesc.getEvaluators(vectorPTFDesc, vectorPTFInfo); + ArrayList result = new ArrayList(); + for (VectorPTFEvaluatorBase evaluator : evaluators) { + if (evaluator.streamsResult()) { + result.add(evaluator.getOutputColumnNum()); + } + } + return result.toString(); + } + } + + @Explain(vectorization = Vectorization.OPERATOR, displayName = "PTF Vectorization", explainLevels = { Level.DEFAULT, Level.EXTENDED }) + public PTFOperatorExplainVectorization getPTFVectorization() { 
+ if (vectorDesc == null) { + return null; + } + return new PTFOperatorExplainVectorization(this, vectorDesc); + } } diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/VectorPTFDesc.java ql/src/java/org/apache/hadoop/hive/ql/plan/VectorPTFDesc.java new file mode 100644 index 0000000..12df583 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/plan/VectorPTFDesc.java @@ -0,0 +1,338 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.plan; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.TreeSet; + +import org.apache.commons.lang.ArrayUtils; +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector.Type; +import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; +import org.apache.hadoop.hive.ql.exec.vector.ptf.VectorPTFEvaluatorBase; +import org.apache.hadoop.hive.ql.exec.vector.ptf.VectorPTFEvaluatorCount; +import org.apache.hadoop.hive.ql.exec.vector.ptf.VectorPTFEvaluatorCountStar; +import org.apache.hadoop.hive.ql.exec.vector.ptf.VectorPTFEvaluatorDecimalAvg; +import org.apache.hadoop.hive.ql.exec.vector.ptf.VectorPTFEvaluatorDecimalFirstValue; +import org.apache.hadoop.hive.ql.exec.vector.ptf.VectorPTFEvaluatorDecimalLastValue; +import org.apache.hadoop.hive.ql.exec.vector.ptf.VectorPTFEvaluatorDecimalMax; +import org.apache.hadoop.hive.ql.exec.vector.ptf.VectorPTFEvaluatorDecimalMin; +import org.apache.hadoop.hive.ql.exec.vector.ptf.VectorPTFEvaluatorDecimalSum; +import org.apache.hadoop.hive.ql.exec.vector.ptf.VectorPTFEvaluatorDenseRank; +import org.apache.hadoop.hive.ql.exec.vector.ptf.VectorPTFEvaluatorDoubleAvg; +import org.apache.hadoop.hive.ql.exec.vector.ptf.VectorPTFEvaluatorDoubleFirstValue; +import org.apache.hadoop.hive.ql.exec.vector.ptf.VectorPTFEvaluatorDoubleLastValue; +import org.apache.hadoop.hive.ql.exec.vector.ptf.VectorPTFEvaluatorDoubleMax; +import org.apache.hadoop.hive.ql.exec.vector.ptf.VectorPTFEvaluatorDoubleMin; +import org.apache.hadoop.hive.ql.exec.vector.ptf.VectorPTFEvaluatorDoubleSum; +import org.apache.hadoop.hive.ql.exec.vector.ptf.VectorPTFEvaluatorLongAvg; +import org.apache.hadoop.hive.ql.exec.vector.ptf.VectorPTFEvaluatorLongFirstValue; +import org.apache.hadoop.hive.ql.exec.vector.ptf.VectorPTFEvaluatorLongLastValue; +import org.apache.hadoop.hive.ql.exec.vector.ptf.VectorPTFEvaluatorLongMax; +import org.apache.hadoop.hive.ql.exec.vector.ptf.VectorPTFEvaluatorLongMin; +import org.apache.hadoop.hive.ql.exec.vector.ptf.VectorPTFEvaluatorLongSum; +import org.apache.hadoop.hive.ql.exec.vector.ptf.VectorPTFEvaluatorRank; +import org.apache.hadoop.hive.ql.exec.vector.ptf.VectorPTFEvaluatorRowNumber; +import org.apache.hadoop.hive.ql.plan.ptf.WindowFrameDef; +import 
org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+
+/**
+ * VectorPTFDesc.
+ *
+ * Extra parameters beyond PTFDesc just for the VectorPTFOperator.
+ *
+ * We don't extend PTFDesc because the base OperatorDesc doesn't support
+ * clone and adding it is a lot of work for little gain.
+ */
+public class VectorPTFDesc extends AbstractVectorDesc {
+
+  private static final long serialVersionUID = 1L;
+
+  public static enum SupportedFunctionType {
+    ROW_NUMBER,
+    RANK,
+    DENSE_RANK,
+    MIN,
+    MAX,
+    SUM,
+    AVG,
+    FIRST_VALUE,
+    LAST_VALUE,
+    COUNT
+  }
+
+  public static HashMap<String, SupportedFunctionType> supportedFunctionsMap =
+      new HashMap<String, SupportedFunctionType>();
+  static {
+    supportedFunctionsMap.put("row_number", SupportedFunctionType.ROW_NUMBER);
+    supportedFunctionsMap.put("rank", SupportedFunctionType.RANK);
+    supportedFunctionsMap.put("dense_rank", SupportedFunctionType.DENSE_RANK);
+    supportedFunctionsMap.put("min", SupportedFunctionType.MIN);
+    supportedFunctionsMap.put("max", SupportedFunctionType.MAX);
+    supportedFunctionsMap.put("sum", SupportedFunctionType.SUM);
+    supportedFunctionsMap.put("avg", SupportedFunctionType.AVG);
+    supportedFunctionsMap.put("first_value", SupportedFunctionType.FIRST_VALUE);
+    supportedFunctionsMap.put("last_value", SupportedFunctionType.LAST_VALUE);
+    supportedFunctionsMap.put("count", SupportedFunctionType.COUNT);
+  }
+  public static List<String> supportedFunctionNames = new ArrayList<String>();
+  static {
+    TreeSet<String> treeSet = new TreeSet<String>();
+    treeSet.addAll(supportedFunctionsMap.keySet());
+    supportedFunctionNames.addAll(treeSet);
+  }
+
+  private boolean isPartitionOrderBy;
+
+  private String[] evaluatorFunctionNames;
+  private WindowFrameDef[] evaluatorWindowFrameDefs;
+  private ExprNodeDesc[] evaluatorInputExprNodeDescs;
+
+  private ExprNodeDesc[] orderExprNodeDescs;
+  private ExprNodeDesc[] partitionExprNodeDescs;
+
+  private VectorPTFInfo vectorPTFInfo;
+
+  public VectorPTFDesc() {
+    isPartitionOrderBy = false;
+
+    evaluatorFunctionNames = null;
+    evaluatorInputExprNodeDescs = null;
+
+    orderExprNodeDescs = null;
+    partitionExprNodeDescs = null;
+  }
+
+  // We provide this public method to help EXPLAIN VECTORIZATION show the evaluator classes.
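+  // For example, for count the evaluator flavor is decided by the input expression:
+  // a null input vector expression marks count(*) and yields VectorPTFEvaluatorCountStar,
+  // while a non-null input yields VectorPTFEvaluatorCount (see the COUNT case below).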
+ public static VectorPTFEvaluatorBase getEvaluator(SupportedFunctionType functionType, + WindowFrameDef windowFrameDef, Type columnVectorType, VectorExpression inputVectorExpression, + int outputColumnNum) { + + VectorPTFEvaluatorBase evaluator; + switch (functionType) { + case ROW_NUMBER: + evaluator = new VectorPTFEvaluatorRowNumber(windowFrameDef, inputVectorExpression, outputColumnNum); + break; + case RANK: + evaluator = new VectorPTFEvaluatorRank(windowFrameDef, inputVectorExpression, outputColumnNum); + break; + case DENSE_RANK: + evaluator = new VectorPTFEvaluatorDenseRank(windowFrameDef, inputVectorExpression, outputColumnNum); + break; + case MIN: + switch (columnVectorType) { + case LONG: + evaluator = new VectorPTFEvaluatorLongMin(windowFrameDef, inputVectorExpression, outputColumnNum); + break; + case DOUBLE: + evaluator = new VectorPTFEvaluatorDoubleMin(windowFrameDef, inputVectorExpression, outputColumnNum); + break; + case DECIMAL: + evaluator = new VectorPTFEvaluatorDecimalMin(windowFrameDef, inputVectorExpression, outputColumnNum); + break; + default: + throw new RuntimeException("Unexpected column vector type " + columnVectorType + " for " + functionType); + } + break; + case MAX: + switch (columnVectorType) { + case LONG: + evaluator = new VectorPTFEvaluatorLongMax(windowFrameDef, inputVectorExpression, outputColumnNum); + break; + case DOUBLE: + evaluator = new VectorPTFEvaluatorDoubleMax(windowFrameDef, inputVectorExpression, outputColumnNum); + break; + case DECIMAL: + evaluator = new VectorPTFEvaluatorDecimalMax(windowFrameDef, inputVectorExpression, outputColumnNum); + break; + default: + throw new RuntimeException("Unexpected column vector type " + columnVectorType + " for " + functionType); + } + break; + case SUM: + switch (columnVectorType) { + case LONG: + evaluator = new VectorPTFEvaluatorLongSum(windowFrameDef, inputVectorExpression, outputColumnNum); + break; + case DOUBLE: + evaluator = new VectorPTFEvaluatorDoubleSum(windowFrameDef, inputVectorExpression, outputColumnNum); + break; + case DECIMAL: + evaluator = new VectorPTFEvaluatorDecimalSum(windowFrameDef, inputVectorExpression, outputColumnNum); + break; + default: + throw new RuntimeException("Unexpected column vector type " + columnVectorType + " for " + functionType); + } + break; + case AVG: + switch (columnVectorType) { + case LONG: + evaluator = new VectorPTFEvaluatorLongAvg(windowFrameDef, inputVectorExpression, outputColumnNum); + break; + case DOUBLE: + evaluator = new VectorPTFEvaluatorDoubleAvg(windowFrameDef, inputVectorExpression, outputColumnNum); + break; + case DECIMAL: + evaluator = new VectorPTFEvaluatorDecimalAvg(windowFrameDef, inputVectorExpression, outputColumnNum); + break; + default: + throw new RuntimeException("Unexpected column vector type " + columnVectorType + " for " + functionType); + } + break; + case FIRST_VALUE: + switch (columnVectorType) { + case LONG: + evaluator = new VectorPTFEvaluatorLongFirstValue(windowFrameDef, inputVectorExpression, outputColumnNum); + break; + case DOUBLE: + evaluator = new VectorPTFEvaluatorDoubleFirstValue(windowFrameDef, inputVectorExpression, outputColumnNum); + break; + case DECIMAL: + evaluator = new VectorPTFEvaluatorDecimalFirstValue(windowFrameDef, inputVectorExpression, outputColumnNum); + break; + default: + throw new RuntimeException("Unexpected column vector type " + columnVectorType + " for " + functionType); + } + break; + case LAST_VALUE: + switch (columnVectorType) { + case LONG: + evaluator = new 
VectorPTFEvaluatorLongLastValue(windowFrameDef, inputVectorExpression, outputColumnNum); + break; + case DOUBLE: + evaluator = new VectorPTFEvaluatorDoubleLastValue(windowFrameDef, inputVectorExpression, outputColumnNum); + break; + case DECIMAL: + evaluator = new VectorPTFEvaluatorDecimalLastValue(windowFrameDef, inputVectorExpression, outputColumnNum); + break; + default: + throw new RuntimeException("Unexpected column vector type " + columnVectorType + " for " + functionType); + } + break; + case COUNT: + if (inputVectorExpression == null) { + evaluator = new VectorPTFEvaluatorCountStar(windowFrameDef, inputVectorExpression, outputColumnNum); + } else { + evaluator = new VectorPTFEvaluatorCount(windowFrameDef, inputVectorExpression, outputColumnNum); + } + break; + default: + throw new RuntimeException("Unexpected function type " + functionType); + } + return evaluator; + } + + public static VectorPTFEvaluatorBase[] getEvaluators(VectorPTFDesc vectorPTFDesc, VectorPTFInfo vectorPTFInfo) { + String[] evaluatorFunctionNames = vectorPTFDesc.getEvaluatorFunctionNames(); + int evaluatorCount = evaluatorFunctionNames.length; + WindowFrameDef[] evaluatorWindowFrameDefs = vectorPTFDesc.getEvaluatorWindowFrameDefs(); + VectorExpression[] evaluatorInputExpressions = vectorPTFInfo.getEvaluatorInputExpressions(); + Type[] evaluatorInputColumnVectorTypes = vectorPTFInfo.getEvaluatorInputColumnVectorTypes(); + + int[] outputColumnMap = vectorPTFInfo.getOutputColumnMap(); + + VectorPTFEvaluatorBase[] evaluators = new VectorPTFEvaluatorBase[evaluatorCount]; + for (int i = 0; i < evaluatorCount; i++) { + String functionName = evaluatorFunctionNames[i]; + WindowFrameDef windowFrameDef = evaluatorWindowFrameDefs[i]; + SupportedFunctionType functionType = VectorPTFDesc.supportedFunctionsMap.get(functionName); + VectorExpression inputVectorExpression = evaluatorInputExpressions[i]; + final Type columnVectorType = evaluatorInputColumnVectorTypes[i]; + + // The output* arrays start at index 0 for output evaluator aggregations. 
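+      // That is, outputColumnMap[i] is the batch column where evaluator i writes its result.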
+      final int outputColumnNum = outputColumnMap[i];
+
+      VectorPTFEvaluatorBase evaluator =
+          VectorPTFDesc.getEvaluator(
+              functionType, windowFrameDef, columnVectorType, inputVectorExpression, outputColumnNum);
+
+      evaluators[i] = evaluator;
+    }
+    return evaluators;
+  }
+
+  public static int[] getStreamingColumnMap(VectorPTFEvaluatorBase[] evaluators) {
+    final int evaluatorCount = evaluators.length;
+    ArrayList<Integer> streamingColumns = new ArrayList<Integer>();
+    for (int i = 0; i < evaluatorCount; i++) {
+      final VectorPTFEvaluatorBase evaluator = evaluators[i];
+      if (evaluator.streamsResult()) {
+        streamingColumns.add(evaluator.getOutputColumnNum());
+      }
+    }
+    return ArrayUtils.toPrimitive(streamingColumns.toArray(new Integer[0]));
+  }
+
+  public boolean getIsPartitionOrderBy() {
+    return isPartitionOrderBy;
+  }
+
+  public void setIsPartitionOrderBy(boolean isPartitionOrderBy) {
+    this.isPartitionOrderBy = isPartitionOrderBy;
+  }
+
+  public String[] getEvaluatorFunctionNames() {
+    return evaluatorFunctionNames;
+  }
+
+  public void setEvaluatorFunctionNames(String[] evaluatorFunctionNames) {
+    this.evaluatorFunctionNames = evaluatorFunctionNames;
+  }
+
+  public WindowFrameDef[] getEvaluatorWindowFrameDefs() {
+    return evaluatorWindowFrameDefs;
+  }
+
+  public void setEvaluatorWindowFrameDefs(WindowFrameDef[] evaluatorWindowFrameDefs) {
+    this.evaluatorWindowFrameDefs = evaluatorWindowFrameDefs;
+  }
+
+  public ExprNodeDesc[] getEvaluatorInputExprNodeDescs() {
+    return evaluatorInputExprNodeDescs;
+  }
+
+  public void setEvaluatorInputExprNodeDescs(ExprNodeDesc[] evaluatorInputExprNodeDescs) {
+    this.evaluatorInputExprNodeDescs = evaluatorInputExprNodeDescs;
+  }
+
+  public ExprNodeDesc[] getOrderExprNodeDescs() {
+    return orderExprNodeDescs;
+  }
+
+  public void setOrderExprNodeDescs(ExprNodeDesc[] orderExprNodeDescs) {
+    this.orderExprNodeDescs = orderExprNodeDescs;
+  }
+
+  public ExprNodeDesc[] getPartitionExprNodeDescs() {
+    return partitionExprNodeDescs;
+  }
+
+  public void setPartitionExprNodeDescs(ExprNodeDesc[] partitionExprNodeDescs) {
+    this.partitionExprNodeDescs = partitionExprNodeDescs;
+  }
+
+  public void setVectorPTFInfo(VectorPTFInfo vectorPTFInfo) {
+    this.vectorPTFInfo = vectorPTFInfo;
+  }
+
+  public VectorPTFInfo getVectorPTFInfo() {
+    return vectorPTFInfo;
+  }
+}
diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/VectorPTFInfo.java ql/src/java/org/apache/hadoop/hive/ql/plan/VectorPTFInfo.java
new file mode 100644
index 0000000..3affb22
--- /dev/null
+++ ql/src/java/org/apache/hadoop/hive/ql/plan/VectorPTFInfo.java
@@ -0,0 +1,181 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.plan;
+
+import org.apache.commons.lang.ArrayUtils;
+import org.apache.hadoop.hive.ql.exec.vector.ColumnVector.Type;
+import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+
+/**
+ * VectorPTFInfo.
+ *
+ * A convenience data structure that has information needed to vectorize PTF.
+ *
+ * It is created by the Vectorizer when it is determining whether it can specialize so the
+ * information doesn't have to be recreated again and again by the VectorPTFOperator's
+ * constructors and later during execution.
+ */
+public class VectorPTFInfo {
+
+  private static final long serialVersionUID = 1L;
+
+  private int[] outputColumnMap;
+  private String[] outputColumnNames;
+  private TypeInfo[] outputTypeInfos;
+
+  private int[] orderColumnMap;
+  private Type[] orderColumnVectorTypes;
+  private VectorExpression[] orderExpressions;
+
+  private int[] partitionColumnMap;
+  private Type[] partitionColumnVectorTypes;
+  private VectorExpression[] partitionExpressions;
+
+  private VectorExpression[] evaluatorInputExpressions;
+  private Type[] evaluatorInputColumnVectorTypes;
+
+  private int[] keyInputColumnMap;
+  private int[] nonKeyInputColumnMap;
+
+  public VectorPTFInfo() {
+
+    outputColumnMap = null;
+    outputColumnNames = null;
+    outputTypeInfos = null;
+
+    orderColumnMap = null;
+    orderColumnVectorTypes = null;
+    orderExpressions = null;
+
+    partitionColumnMap = null;
+    partitionColumnVectorTypes = null;
+    partitionExpressions = null;
+
+    evaluatorInputExpressions = null;
+    evaluatorInputColumnVectorTypes = null;
+
+    keyInputColumnMap = null;
+    nonKeyInputColumnMap = null;
+  }
+
+  public int[] getOutputColumnMap() {
+    return outputColumnMap;
+  }
+
+  public void setOutputColumnMap(int[] outputColumnMap) {
+    this.outputColumnMap = outputColumnMap;
+  }
+
+  public String[] getOutputColumnNames() {
+    return outputColumnNames;
+  }
+
+  public void setOutputColumnNames(String[] outputColumnNames) {
+    this.outputColumnNames = outputColumnNames;
+  }
+
+  public TypeInfo[] getOutputTypeInfos() {
+    return outputTypeInfos;
+  }
+
+  public void setOutputTypeInfos(TypeInfo[] outputTypeInfos) {
+    this.outputTypeInfos = outputTypeInfos;
+  }
+
+  public int[] getOrderColumnMap() {
+    return orderColumnMap;
+  }
+
+  public void setOrderColumnMap(int[] orderColumnMap) {
+    this.orderColumnMap = orderColumnMap;
+  }
+
+  public Type[] getOrderColumnVectorTypes() {
+    return orderColumnVectorTypes;
+  }
+
+  public void setOrderColumnVectorTypes(Type[] orderColumnVectorTypes) {
+    this.orderColumnVectorTypes = orderColumnVectorTypes;
+  }
+
+  public VectorExpression[] getOrderExpressions() {
+    return orderExpressions;
+  }
+
+  public void setOrderExpressions(VectorExpression[] orderExpressions) {
+    this.orderExpressions = orderExpressions;
+  }
+
+  public int[] getPartitionColumnMap() {
+    return partitionColumnMap;
+  }
+
+  public void setPartitionColumnMap(int[] partitionColumnMap) {
+    this.partitionColumnMap = partitionColumnMap;
+  }
+
+  public Type[] getPartitionColumnVectorTypes() {
+    return partitionColumnVectorTypes;
+  }
+
+  public void setPartitionColumnVectorTypes(Type[] partitionColumnVectorTypes) {
+    this.partitionColumnVectorTypes = partitionColumnVectorTypes;
+  }
+
+  public VectorExpression[] getPartitionExpressions() {
+    return partitionExpressions;
+  }
+
+  public void setPartitionExpressions(VectorExpression[] partitionExpressions) {
+    this.partitionExpressions =
partitionExpressions; + } + + public VectorExpression[] getEvaluatorInputExpressions() { + return evaluatorInputExpressions; + } + + public void setEvaluatorInputExpressions(VectorExpression[] evaluatorInputExpressions) { + this.evaluatorInputExpressions = evaluatorInputExpressions; + } + + public Type[] getEvaluatorInputColumnVectorTypes() { + return evaluatorInputColumnVectorTypes; + } + + public void setEvaluatorInputColumnVectorTypes(Type[] evaluatorInputColumnVectorTypes) { + this.evaluatorInputColumnVectorTypes = evaluatorInputColumnVectorTypes; + } + + public int[] getKeyInputColumnMap() { + return keyInputColumnMap; + } + + public void setKeyInputColumnMap(int[] keyInputColumnMap) { + this.keyInputColumnMap = keyInputColumnMap; + } + + public int[] getNonKeyInputColumnMap() { + return nonKeyInputColumnMap; + } + + public void setNonKeyInputColumnMap(int[] nonKeyInputColumnMap) { + this.nonKeyInputColumnMap = nonKeyInputColumnMap; + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/ptf/WindowFrameDef.java ql/src/java/org/apache/hadoop/hive/ql/plan/ptf/WindowFrameDef.java index 0af878b..346abe3 100644 --- ql/src/java/org/apache/hadoop/hive/ql/plan/ptf/WindowFrameDef.java +++ ql/src/java/org/apache/hadoop/hive/ql/plan/ptf/WindowFrameDef.java @@ -78,6 +78,6 @@ public int getWindowSize() { @Override public String toString() { - return start + "~" + end; + return windowType + " " + start + "~" + end; } } diff --git ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFLeadLag.java ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFLeadLag.java index b3b36bc..bec0370 100644 --- ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFLeadLag.java +++ ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFLeadLag.java @@ -146,7 +146,9 @@ public void setAmt(int amt) { @Override public String getDisplayString(String[] children) { - assert (children.length == 2); + if (children.length != 2) { + return _getFnName() + "(...)"; + } return getStandardDisplayString(_getFnName(), children); } diff --git ql/src/test/queries/clientpositive/vector_ptf_part_simple.q ql/src/test/queries/clientpositive/vector_ptf_part_simple.q new file mode 100644 index 0000000..4f3a538 --- /dev/null +++ ql/src/test/queries/clientpositive/vector_ptf_part_simple.q @@ -0,0 +1,268 @@ +set hive.cli.print.header=true; +SET hive.vectorized.execution.enabled=true; +set hive.fetch.task.conversion=none; + +create table vector_ptf_part_simple_text(p_mfgr string, p_name string, p_retailprice double) + ROW FORMAT DELIMITED + FIELDS TERMINATED BY '\t' + STORED AS TEXTFILE; +LOAD DATA LOCAL INPATH '../../data/files/vector_ptf_part_simple.txt' OVERWRITE INTO TABLE vector_ptf_part_simple_text; + +create table vector_ptf_part_simple_orc(p_mfgr string, p_name string, p_retailprice double) stored as orc; +INSERT INTO TABLE vector_ptf_part_simple_orc SELECT * FROM vector_ptf_part_simple_text; + +select * from vector_ptf_part_simple_orc; + + +explain vectorization detail +select p_mfgr,p_name, p_retailprice, +row_number() over(partition by p_mfgr) as rn, +rank() over(partition by p_mfgr) as r, +dense_rank() over(partition by p_mfgr) as dr, +first_value(p_retailprice) over(partition by p_mfgr) as fv, +last_value(p_retailprice) over(partition by p_mfgr) as lv, +count(p_retailprice) over(partition by p_mfgr) as c, +count(*) over(partition by p_mfgr) as cs +from vector_ptf_part_simple_orc; + +select p_mfgr,p_name, p_retailprice, +row_number() over(partition by p_mfgr) as rn, +rank() over(partition by p_mfgr) as r, 
+dense_rank() over(partition by p_mfgr) as dr, +first_value(p_retailprice) over(partition by p_mfgr) as fv, +last_value(p_retailprice) over(partition by p_mfgr) as lv, +count(p_retailprice) over(partition by p_mfgr) as c, +count(*) over(partition by p_mfgr) as cs +from vector_ptf_part_simple_orc; + + +explain vectorization detail +select p_mfgr,p_name, p_retailprice, +row_number() over(partition by p_mfgr order by p_name) as rn, +rank() over(partition by p_mfgr order by p_name) as r, +dense_rank() over(partition by p_mfgr order by p_name) as dr, +first_value(p_retailprice) over(partition by p_mfgr order by p_name) as fv, +last_value(p_retailprice) over(partition by p_mfgr order by p_name) as lv, +count(p_retailprice) over(partition by p_mfgr order by p_name) as c, +count(*) over(partition by p_mfgr order by p_name) as cs +from vector_ptf_part_simple_orc; + +select p_mfgr,p_name, p_retailprice, +row_number() over(partition by p_mfgr order by p_name) as rn, +rank() over(partition by p_mfgr order by p_name) as r, +dense_rank() over(partition by p_mfgr order by p_name) as dr, +first_value(p_retailprice) over(partition by p_mfgr order by p_name) as fv, +last_value(p_retailprice) over(partition by p_mfgr order by p_name) as lv, +count(p_retailprice) over(partition by p_mfgr order by p_name) as c, +count(*) over(partition by p_mfgr order by p_name) as cs +from vector_ptf_part_simple_orc; + + +explain vectorization detail +select p_mfgr,p_name, p_retailprice, +row_number() over(partition by p_mfgr order by p_name range between unbounded preceding and unbounded following) as rn, +rank() over(partition by p_mfgr order by p_name range between unbounded preceding and unbounded following) as r, +dense_rank() over(partition by p_mfgr order by p_name range between unbounded preceding and unbounded following) as dr, +first_value(p_retailprice) over(partition by p_mfgr order by p_name range between unbounded preceding and current row) as fv, +last_value(p_retailprice) over(partition by p_mfgr order by p_name range between unbounded preceding and current row) as lv, +count(p_retailprice) over(partition by p_mfgr order by p_name range between unbounded preceding and current row) as c, +count(*) over(partition by p_mfgr order by p_name range between unbounded preceding and current row) as cs +from vector_ptf_part_simple_orc; + +select p_mfgr,p_name, p_retailprice, +row_number() over(partition by p_mfgr order by p_name range between unbounded preceding and unbounded following) as rn, +rank() over(partition by p_mfgr order by p_name range between unbounded preceding and unbounded following) as r, +dense_rank() over(partition by p_mfgr order by p_name range between unbounded preceding and unbounded following) as dr, +first_value(p_retailprice) over(partition by p_mfgr order by p_name range between unbounded preceding and current row) as fv, +last_value(p_retailprice) over(partition by p_mfgr order by p_name range between unbounded preceding and current row) as lv, +count(p_retailprice) over(partition by p_mfgr order by p_name range between unbounded preceding and current row) as c, +count(*) over(partition by p_mfgr order by p_name range between unbounded preceding and current row) as cs +from vector_ptf_part_simple_orc; + + +explain vectorization detail +select p_mfgr,p_name, p_retailprice, +sum(p_retailprice) over(partition by p_mfgr) as s, +min(p_retailprice) over(partition by p_mfgr) as mi, +max(p_retailprice) over(partition by p_mfgr) as ma, +avg(p_retailprice) over(partition by p_mfgr) as av +from 
vector_ptf_part_simple_orc; + +select p_mfgr,p_name, p_retailprice, +sum(p_retailprice) over(partition by p_mfgr) as s, +min(p_retailprice) over(partition by p_mfgr) as mi, +max(p_retailprice) over(partition by p_mfgr) as ma, +avg(p_retailprice) over(partition by p_mfgr) as av +from vector_ptf_part_simple_orc; + + +explain vectorization detail +select p_mfgr,p_name, p_retailprice, +sum(p_retailprice) over(partition by p_mfgr order by p_name) as s, +min(p_retailprice) over(partition by p_mfgr order by p_name) as mi, +max(p_retailprice) over(partition by p_mfgr order by p_name) as ma, +avg(p_retailprice) over(partition by p_mfgr order by p_name) as av +from vector_ptf_part_simple_orc; + +select p_mfgr,p_name, p_retailprice, +sum(p_retailprice) over(partition by p_mfgr order by p_name) as s, +min(p_retailprice) over(partition by p_mfgr order by p_name) as mi, +max(p_retailprice) over(partition by p_mfgr order by p_name) as ma, +avg(p_retailprice) over(partition by p_mfgr order by p_name) as av +from vector_ptf_part_simple_orc; + + +explain vectorization detail +select p_mfgr,p_name, p_retailprice, +sum(p_retailprice) over(partition by p_mfgr order by p_name range between unbounded preceding and current row) as s, +min(p_retailprice) over(partition by p_mfgr order by p_name range between unbounded preceding and current row) as mi, +max(p_retailprice) over(partition by p_mfgr order by p_name range between unbounded preceding and current row) as ma, +avg(p_retailprice) over(partition by p_mfgr order by p_name range between unbounded preceding and current row) as av +from vector_ptf_part_simple_orc; + +select p_mfgr,p_name, p_retailprice, +sum(p_retailprice) over(partition by p_mfgr order by p_name range between unbounded preceding and current row) as s, +min(p_retailprice) over(partition by p_mfgr order by p_name range between unbounded preceding and current row) as mi, +max(p_retailprice) over(partition by p_mfgr order by p_name range between unbounded preceding and current row) as ma, +avg(p_retailprice) over(partition by p_mfgr order by p_name range between unbounded preceding and current row) as av +from vector_ptf_part_simple_orc; + + +-- +-- ROW +-- +explain vectorization detail +select p_mfgr,p_name, p_retailprice, +sum(p_retailprice) over(partition by p_mfgr order by p_name rows between unbounded preceding and current row) as s, +min(p_retailprice) over(partition by p_mfgr order by p_name rows between unbounded preceding and current row) as mi, +max(p_retailprice) over(partition by p_mfgr order by p_name rows between unbounded preceding and current row) as ma, +avg(p_retailprice) over(partition by p_mfgr order by p_name rows between unbounded preceding and current row) as av +from vector_ptf_part_simple_orc; + +select p_mfgr,p_name, p_retailprice, +sum(p_retailprice) over(partition by p_mfgr order by p_name rows between unbounded preceding and current row) as s, +min(p_retailprice) over(partition by p_mfgr order by p_name rows between unbounded preceding and current row) as mi, +max(p_retailprice) over(partition by p_mfgr order by p_name rows between unbounded preceding and current row) as ma, +avg(p_retailprice) over(partition by p_mfgr order by p_name rows between unbounded preceding and current row) as av +from vector_ptf_part_simple_orc; + + +create table vector_ptf_part_simple_text_decimal(p_mfgr string, p_name string, p_retailprice decimal(38,18)) + ROW FORMAT DELIMITED + FIELDS TERMINATED BY '\t' + STORED AS TEXTFILE; +LOAD DATA LOCAL INPATH 
'../../data/files/vector_ptf_part_simple.txt' OVERWRITE INTO TABLE vector_ptf_part_simple_text_decimal; + +create table vector_ptf_part_simple_orc_decimal(p_mfgr string, p_name string, p_retailprice decimal(38,18)) stored as orc; +INSERT INTO TABLE vector_ptf_part_simple_orc_decimal SELECT * FROM vector_ptf_part_simple_text_decimal; + +explain vectorization detail +select p_mfgr,p_name, p_retailprice, +sum(p_retailprice) over(partition by p_mfgr) as s, +min(p_retailprice) over(partition by p_mfgr) as mi, +max(p_retailprice) over(partition by p_mfgr) as ma, +avg(p_retailprice) over(partition by p_mfgr) as av +from vector_ptf_part_simple_orc_decimal; + +select p_mfgr,p_name, p_retailprice, +sum(p_retailprice) over(partition by p_mfgr) as s, +min(p_retailprice) over(partition by p_mfgr) as mi, +max(p_retailprice) over(partition by p_mfgr) as ma, +avg(p_retailprice) over(partition by p_mfgr) as av +from vector_ptf_part_simple_orc_decimal; + + +explain vectorization detail +select p_mfgr,p_name, p_retailprice, +sum(p_retailprice) over(partition by p_mfgr order by p_name) as s, +min(p_retailprice) over(partition by p_mfgr order by p_name) as mi, +max(p_retailprice) over(partition by p_mfgr order by p_name) as ma, +avg(p_retailprice) over(partition by p_mfgr order by p_name) as av +from vector_ptf_part_simple_orc_decimal; + +select p_mfgr,p_name, p_retailprice, +sum(p_retailprice) over(partition by p_mfgr order by p_name) as s, +min(p_retailprice) over(partition by p_mfgr order by p_name) as mi, +max(p_retailprice) over(partition by p_mfgr order by p_name) as ma, +avg(p_retailprice) over(partition by p_mfgr order by p_name) as av +from vector_ptf_part_simple_orc_decimal; + + + + +create table vector_ptf_part_simple_orc_long(p_mfgr string, p_name string, p_bigint bigint) stored as orc; +INSERT INTO TABLE vector_ptf_part_simple_orc_long SELECT p_mfgr, p_name, cast(p_retailprice * 100 as bigint) FROM vector_ptf_part_simple_text_decimal; + +explain vectorization detail +select p_mfgr,p_name, p_bigint, +sum(p_bigint) over(partition by p_mfgr) as s, +min(p_bigint) over(partition by p_mfgr) as mi, +max(p_bigint) over(partition by p_mfgr) as ma, +avg(p_bigint) over(partition by p_mfgr) as av +from vector_ptf_part_simple_orc_long; + +select p_mfgr,p_name, p_bigint, +sum(p_bigint) over(partition by p_mfgr) as s, +min(p_bigint) over(partition by p_mfgr) as mi, +max(p_bigint) over(partition by p_mfgr) as ma, +avg(p_bigint) over(partition by p_mfgr) as av +from vector_ptf_part_simple_orc_long; + + +explain vectorization detail +select p_mfgr,p_name, p_bigint, +sum(p_bigint) over(partition by p_mfgr order by p_name) as s, +min(p_bigint) over(partition by p_mfgr order by p_name) as mi, +max(p_bigint) over(partition by p_mfgr order by p_name) as ma, +avg(p_bigint) over(partition by p_mfgr order by p_name) as av +from vector_ptf_part_simple_orc_long; + +select p_mfgr,p_name, p_bigint, +sum(p_bigint) over(partition by p_mfgr order by p_name) as s, +min(p_bigint) over(partition by p_mfgr order by p_name) as mi, +max(p_bigint) over(partition by p_mfgr order by p_name) as ma, +avg(p_bigint) over(partition by p_mfgr order by p_name) as av +from vector_ptf_part_simple_orc_long; + + +-- Omit p_name columns + +explain vectorization detail +select p_mfgr, p_retailprice, +rank() over(partition by p_mfgr) as r +from vector_ptf_part_simple_orc; + +select p_mfgr, p_retailprice, +rank() over(partition by p_mfgr) as r +from vector_ptf_part_simple_orc; + + +explain vectorization detail +select p_mfgr, p_retailprice, +rank() 
over(partition by p_mfgr order by p_name) as r +from vector_ptf_part_simple_orc; + +select p_mfgr, p_retailprice, +rank() over(partition by p_mfgr order by p_name) as r +from vector_ptf_part_simple_orc; + + +-- Calculated partition key + +explain vectorization detail +select p_mfgr, p_name, p_retailprice, +rank() over(partition by p_mfgr, case when p_mfgr == "Manufacturer#2" then timestamp "2000-01-01 00:00:00" end) as r +from vector_ptf_part_simple_orc; + +select p_mfgr, p_name, p_retailprice, +rank() over(partition by p_mfgr, case when p_mfgr == "Manufacturer#2" then timestamp "2000-01-01 00:00:00" end) as r +from vector_ptf_part_simple_orc; + +explain vectorization detail +select p_mfgr, p_name, p_retailprice, +rank() over(partition by p_mfgr, case when p_mfgr == "Manufacturer#2" then timestamp "2000-01-01 00:00:00" end order by p_name) as r +from vector_ptf_part_simple_orc; + +select p_mfgr, p_name, p_retailprice, +rank() over(partition by p_mfgr, case when p_mfgr == "Manufacturer#2" then timestamp "2000-01-01 00:00:00" end order by p_name) as r +from vector_ptf_part_simple_orc; diff --git ql/src/test/queries/clientpositive/vector_windowing.q ql/src/test/queries/clientpositive/vector_windowing.q new file mode 100644 index 0000000..c042157 --- /dev/null +++ ql/src/test/queries/clientpositive/vector_windowing.q @@ -0,0 +1,790 @@ +set hive.cli.print.header=true; +SET hive.vectorized.execution.enabled=true; +set hive.fetch.task.conversion=none; + +set hive.mapred.mode=nonstrict; +set mapred.reduce.tasks=4; +-- SORT_QUERY_RESULTS + +-- 1. testWindowing +explain vectorization detail +select p_mfgr, p_name, p_size, +rank() over(distribute by p_mfgr sort by p_name) as r, +dense_rank() over(distribute by p_mfgr sort by p_name) as dr, +round(sum(p_retailprice) over (distribute by p_mfgr sort by p_name rows between unbounded preceding and current row),2) as s1 +from part +; +select p_mfgr, p_name, p_size, +rank() over(distribute by p_mfgr sort by p_name) as r, +dense_rank() over(distribute by p_mfgr sort by p_name) as dr, +round(sum(p_retailprice) over (distribute by p_mfgr sort by p_name rows between unbounded preceding and current row),2) as s1 +from part +; + +-- 2. testGroupByWithPartitioning +explain vectorization detail +select p_mfgr, p_name, p_size, +min(p_retailprice), +rank() over(distribute by p_mfgr sort by p_name)as r, +dense_rank() over(distribute by p_mfgr sort by p_name) as dr, +p_size, p_size - lag(p_size,1,p_size) over(distribute by p_mfgr sort by p_name) as deltaSz +from part +group by p_mfgr, p_name, p_size +; +select p_mfgr, p_name, p_size, +min(p_retailprice), +rank() over(distribute by p_mfgr sort by p_name)as r, +dense_rank() over(distribute by p_mfgr sort by p_name) as dr, +p_size, p_size - lag(p_size,1,p_size) over(distribute by p_mfgr sort by p_name) as deltaSz +from part +group by p_mfgr, p_name, p_size +; + +-- 3. 
testGroupByHavingWithSWQ +explain vectorization detail +select p_mfgr, p_name, p_size, min(p_retailprice), +rank() over(distribute by p_mfgr sort by p_name) as r, +dense_rank() over(distribute by p_mfgr sort by p_name) as dr, +p_size, p_size - lag(p_size,1,p_size) over(distribute by p_mfgr sort by p_name) as deltaSz +from part +group by p_mfgr, p_name, p_size +having p_size > 0 +; +select p_mfgr, p_name, p_size, min(p_retailprice), +rank() over(distribute by p_mfgr sort by p_name) as r, +dense_rank() over(distribute by p_mfgr sort by p_name) as dr, +p_size, p_size - lag(p_size,1,p_size) over(distribute by p_mfgr sort by p_name) as deltaSz +from part +group by p_mfgr, p_name, p_size +having p_size > 0 +; + +-- 4. testCount +explain vectorization detail +select p_mfgr, p_name, +count(p_size) over(distribute by p_mfgr sort by p_name) as cd +from part +; +select p_mfgr, p_name, +count(p_size) over(distribute by p_mfgr sort by p_name) as cd +from part +; + +-- 5. testCountWithWindowingUDAF +explain vectorization detail +select p_mfgr, p_name, +rank() over(distribute by p_mfgr sort by p_name) as r, +dense_rank() over(distribute by p_mfgr sort by p_name) as dr, +count(p_size) over(distribute by p_mfgr sort by p_name) as cd, +p_retailprice, round(sum(p_retailprice) over (distribute by p_mfgr sort by p_name rows between unbounded preceding and current row),2) as s1, +p_size, p_size - lag(p_size,1,p_size) over(distribute by p_mfgr sort by p_name) as deltaSz +from part +; +select p_mfgr, p_name, +rank() over(distribute by p_mfgr sort by p_name) as r, +dense_rank() over(distribute by p_mfgr sort by p_name) as dr, +count(p_size) over(distribute by p_mfgr sort by p_name) as cd, +p_retailprice, round(sum(p_retailprice) over (distribute by p_mfgr sort by p_name rows between unbounded preceding and current row),2) as s1, +p_size, p_size - lag(p_size,1,p_size) over(distribute by p_mfgr sort by p_name) as deltaSz +from part +; + +-- 6. testCountInSubQ +explain vectorization detail +select sub1.r, sub1.dr, sub1.cd, sub1.s1, sub1.deltaSz +from (select p_mfgr, p_name, +rank() over(distribute by p_mfgr sort by p_name) as r, +dense_rank() over(distribute by p_mfgr sort by p_name) as dr, +count(p_size) over(distribute by p_mfgr sort by p_name) as cd, +p_retailprice, round(sum(p_retailprice) over (distribute by p_mfgr sort by p_name rows between unbounded preceding and current row),2) as s1, +p_size, p_size - lag(p_size,1,p_size) over(distribute by p_mfgr sort by p_name) as deltaSz +from part +) sub1; +select sub1.r, sub1.dr, sub1.cd, sub1.s1, sub1.deltaSz +from (select p_mfgr, p_name, +rank() over(distribute by p_mfgr sort by p_name) as r, +dense_rank() over(distribute by p_mfgr sort by p_name) as dr, +count(p_size) over(distribute by p_mfgr sort by p_name) as cd, +p_retailprice, round(sum(p_retailprice) over (distribute by p_mfgr sort by p_name rows between unbounded preceding and current row),2) as s1, +p_size, p_size - lag(p_size,1,p_size) over(distribute by p_mfgr sort by p_name) as deltaSz +from part +) sub1; + +-- 7. 
testJoinWithWindowingAndPTF +explain vectorization detail +select abc.p_mfgr, abc.p_name, +rank() over(distribute by abc.p_mfgr sort by abc.p_name) as r, +dense_rank() over(distribute by abc.p_mfgr sort by abc.p_name) as dr, +abc.p_retailprice, round(sum(abc.p_retailprice) over (distribute by abc.p_mfgr sort by abc.p_name rows between unbounded preceding and current row),2) as s1, +abc.p_size, abc.p_size - lag(abc.p_size,1,abc.p_size) over(distribute by abc.p_mfgr sort by abc.p_name) as deltaSz +from noop(on part +partition by p_mfgr +order by p_name +) abc join part p1 on abc.p_partkey = p1.p_partkey +; +select abc.p_mfgr, abc.p_name, +rank() over(distribute by abc.p_mfgr sort by abc.p_name) as r, +dense_rank() over(distribute by abc.p_mfgr sort by abc.p_name) as dr, +abc.p_retailprice, round(sum(abc.p_retailprice) over (distribute by abc.p_mfgr sort by abc.p_name rows between unbounded preceding and current row),2) as s1, +abc.p_size, abc.p_size - lag(abc.p_size,1,abc.p_size) over(distribute by abc.p_mfgr sort by abc.p_name) as deltaSz +from noop(on part +partition by p_mfgr +order by p_name +) abc join part p1 on abc.p_partkey = p1.p_partkey +; + +-- 8. testMixedCaseAlias +explain vectorization detail +select p_mfgr, p_name, p_size, +rank() over(distribute by p_mfgr sort by p_name, p_size desc) as R +from part +; +select p_mfgr, p_name, p_size, +rank() over(distribute by p_mfgr sort by p_name, p_size desc) as R +from part +; + +-- 9. testHavingWithWindowingNoGBY +explain vectorization detail +select p_mfgr, p_name, p_size, +rank() over(distribute by p_mfgr sort by p_name) as r, +dense_rank() over(distribute by p_mfgr sort by p_name) as dr, +round(sum(p_retailprice) over (distribute by p_mfgr sort by p_name rows between unbounded preceding and current row),2) as s1 +from part +; +select p_mfgr, p_name, p_size, +rank() over(distribute by p_mfgr sort by p_name) as r, +dense_rank() over(distribute by p_mfgr sort by p_name) as dr, +round(sum(p_retailprice) over (distribute by p_mfgr sort by p_name rows between unbounded preceding and current row),2) as s1 +from part +; + +-- 10. testHavingWithWindowingCondRankNoGBY +explain vectorization detail +select p_mfgr, p_name, p_size, +rank() over(distribute by p_mfgr sort by p_name) as r, +dense_rank() over(distribute by p_mfgr sort by p_name) as dr, +round(sum(p_retailprice) over (distribute by p_mfgr sort by p_name rows between unbounded preceding and current row),2) as s1 +from part +; +select p_mfgr, p_name, p_size, +rank() over(distribute by p_mfgr sort by p_name) as r, +dense_rank() over(distribute by p_mfgr sort by p_name) as dr, +round(sum(p_retailprice) over (distribute by p_mfgr sort by p_name rows between unbounded preceding and current row),2) as s1 +from part +; + +-- 11. testFirstLast +explain vectorization detail +select p_mfgr,p_name, p_size, +sum(p_size) over (distribute by p_mfgr sort by p_name rows between current row and current row) as s2, +first_value(p_size) over w1 as f, +last_value(p_size, false) over w1 as l +from part +window w1 as (distribute by p_mfgr sort by p_name rows between 2 preceding and 2 following); +select p_mfgr,p_name, p_size, +sum(p_size) over (distribute by p_mfgr sort by p_name rows between current row and current row) as s2, +first_value(p_size) over w1 as f, +last_value(p_size, false) over w1 as l +from part +window w1 as (distribute by p_mfgr sort by p_name rows between 2 preceding and 2 following); + +-- 12. 
testFirstLastWithWhere +explain vectorization detail +select p_mfgr,p_name, p_size, +rank() over(distribute by p_mfgr sort by p_name) as r, +sum(p_size) over (distribute by p_mfgr sort by p_name rows between current row and current row) as s2, +first_value(p_size) over w1 as f, +last_value(p_size, false) over w1 as l +from part +where p_mfgr = 'Manufacturer#3' +window w1 as (distribute by p_mfgr sort by p_name rows between 2 preceding and 2 following); +select p_mfgr,p_name, p_size, +rank() over(distribute by p_mfgr sort by p_name) as r, +sum(p_size) over (distribute by p_mfgr sort by p_name rows between current row and current row) as s2, +first_value(p_size) over w1 as f, +last_value(p_size, false) over w1 as l +from part +where p_mfgr = 'Manufacturer#3' +window w1 as (distribute by p_mfgr sort by p_name rows between 2 preceding and 2 following); + +-- 13. testSumWindow +explain vectorization detail +select p_mfgr,p_name, p_size, +sum(p_size) over w1 as s1, +sum(p_size) over (distribute by p_mfgr sort by p_name rows between current row and current row) as s2 +from part +window w1 as (distribute by p_mfgr sort by p_name rows between 2 preceding and 2 following); +select p_mfgr,p_name, p_size, +sum(p_size) over w1 as s1, +sum(p_size) over (distribute by p_mfgr sort by p_name rows between current row and current row) as s2 +from part +window w1 as (distribute by p_mfgr sort by p_name rows between 2 preceding and 2 following); + +-- 14. testNoSortClause +explain vectorization detail +select p_mfgr,p_name, p_size, +rank() over(distribute by p_mfgr sort by p_name) as r, dense_rank() over(distribute by p_mfgr sort by p_name) as dr +from part +window w1 as (distribute by p_mfgr sort by p_name rows between 2 preceding and 2 following); +select p_mfgr,p_name, p_size, +rank() over(distribute by p_mfgr sort by p_name) as r, dense_rank() over(distribute by p_mfgr sort by p_name) as dr +from part +window w1 as (distribute by p_mfgr sort by p_name rows between 2 preceding and 2 following); + +-- 15. 
testExpressions +explain vectorization detail +select p_mfgr,p_name, p_size, +rank() over(distribute by p_mfgr sort by p_name) as r, +dense_rank() over(distribute by p_mfgr sort by p_name) as dr, +cume_dist() over(distribute by p_mfgr sort by p_name) as cud, +percent_rank() over(distribute by p_mfgr sort by p_name) as pr, +ntile(3) over(distribute by p_mfgr sort by p_name) as nt, +count(p_size) over(distribute by p_mfgr sort by p_name) as ca, +avg(p_size) over(distribute by p_mfgr sort by p_name) as avg, +stddev(p_size) over(distribute by p_mfgr sort by p_name) as st, +first_value(p_size % 5) over(distribute by p_mfgr sort by p_name) as fv, +last_value(p_size) over(distribute by p_mfgr sort by p_name) as lv, +first_value(p_size) over w1 as fvW1 +from part +window w1 as (distribute by p_mfgr sort by p_mfgr, p_name rows between 2 preceding and 2 following); +select p_mfgr,p_name, p_size, +rank() over(distribute by p_mfgr sort by p_name) as r, +dense_rank() over(distribute by p_mfgr sort by p_name) as dr, +cume_dist() over(distribute by p_mfgr sort by p_name) as cud, +percent_rank() over(distribute by p_mfgr sort by p_name) as pr, +ntile(3) over(distribute by p_mfgr sort by p_name) as nt, +count(p_size) over(distribute by p_mfgr sort by p_name) as ca, +avg(p_size) over(distribute by p_mfgr sort by p_name) as avg, +stddev(p_size) over(distribute by p_mfgr sort by p_name) as st, +first_value(p_size % 5) over(distribute by p_mfgr sort by p_name) as fv, +last_value(p_size) over(distribute by p_mfgr sort by p_name) as lv, +first_value(p_size) over w1 as fvW1 +from part +window w1 as (distribute by p_mfgr sort by p_mfgr, p_name rows between 2 preceding and 2 following); + +-- 16. testMultipleWindows +explain vectorization detail +select p_mfgr,p_name, p_size, + rank() over(distribute by p_mfgr sort by p_name) as r, + dense_rank() over(distribute by p_mfgr sort by p_name) as dr, +cume_dist() over(distribute by p_mfgr sort by p_name) as cud, +sum(p_size) over (distribute by p_mfgr sort by p_name range between unbounded preceding and current row) as s1, +sum(p_size) over (distribute by p_mfgr sort by p_size range between 5 preceding and current row) as s2, +first_value(p_size) over w1 as fv1 +from part +window w1 as (distribute by p_mfgr sort by p_mfgr, p_name rows between 2 preceding and 2 following); +select p_mfgr,p_name, p_size, + rank() over(distribute by p_mfgr sort by p_name) as r, + dense_rank() over(distribute by p_mfgr sort by p_name) as dr, +cume_dist() over(distribute by p_mfgr sort by p_name) as cud, +sum(p_size) over (distribute by p_mfgr sort by p_name range between unbounded preceding and current row) as s1, +sum(p_size) over (distribute by p_mfgr sort by p_size range between 5 preceding and current row) as s2, +first_value(p_size) over w1 as fv1 +from part +window w1 as (distribute by p_mfgr sort by p_mfgr, p_name rows between 2 preceding and 2 following); + +-- 17. 
testCountStar +explain vectorization detail +select p_mfgr,p_name, p_size, +count(*) over(distribute by p_mfgr sort by p_name ) as c, +count(p_size) over(distribute by p_mfgr sort by p_name) as ca, +first_value(p_size) over w1 as fvW1 +from part +window w1 as (distribute by p_mfgr sort by p_mfgr, p_name rows between 2 preceding and 2 following); +select p_mfgr,p_name, p_size, +count(*) over(distribute by p_mfgr sort by p_name ) as c, +count(p_size) over(distribute by p_mfgr sort by p_name) as ca, +first_value(p_size) over w1 as fvW1 +from part +window w1 as (distribute by p_mfgr sort by p_mfgr, p_name rows between 2 preceding and 2 following); + +-- 18. testUDAFs +explain vectorization detail +select p_mfgr,p_name, p_size, +round(sum(p_retailprice) over w1,2) as s, +min(p_retailprice) over w1 as mi, +max(p_retailprice) over w1 as ma, +round(avg(p_retailprice) over w1,2) as ag +from part +window w1 as (distribute by p_mfgr sort by p_mfgr, p_name rows between 2 preceding and 2 following); +select p_mfgr,p_name, p_size, +round(sum(p_retailprice) over w1,2) as s, +min(p_retailprice) over w1 as mi, +max(p_retailprice) over w1 as ma, +round(avg(p_retailprice) over w1,2) as ag +from part +window w1 as (distribute by p_mfgr sort by p_mfgr, p_name rows between 2 preceding and 2 following); + +-- 19. testUDAFsWithGBY +explain vectorization detail +select p_mfgr,p_name, p_size, p_retailprice, +round(sum(p_retailprice) over w1,2) as s, +min(p_retailprice) as mi , +max(p_retailprice) as ma , +round(avg(p_retailprice) over w1,2) as ag +from part +group by p_mfgr,p_name, p_size, p_retailprice +window w1 as (distribute by p_mfgr sort by p_mfgr, p_name rows between 2 preceding and 2 following); +select p_mfgr,p_name, p_size, p_retailprice, +round(sum(p_retailprice) over w1,2) as s, +min(p_retailprice) as mi , +max(p_retailprice) as ma , +round(avg(p_retailprice) over w1,2) as ag +from part +group by p_mfgr,p_name, p_size, p_retailprice +window w1 as (distribute by p_mfgr sort by p_mfgr, p_name rows between 2 preceding and 2 following); + +-- 20. testSTATs +explain vectorization detail +select p_mfgr,p_name, p_size, +stddev(p_retailprice) over w1 as sdev, +stddev_pop(p_retailprice) over w1 as sdev_pop, +collect_set(p_size) over w1 as uniq_size, +variance(p_retailprice) over w1 as var, +round(corr(p_size, p_retailprice) over w1,5) as cor, +covar_pop(p_size, p_retailprice) over w1 as covarp +from part +window w1 as (distribute by p_mfgr sort by p_mfgr, p_name rows between 2 preceding and 2 following); +select p_mfgr,p_name, p_size, +stddev(p_retailprice) over w1 as sdev, +stddev_pop(p_retailprice) over w1 as sdev_pop, +collect_set(p_size) over w1 as uniq_size, +variance(p_retailprice) over w1 as var, +round(corr(p_size, p_retailprice) over w1,5) as cor, +covar_pop(p_size, p_retailprice) over w1 as covarp +from part +window w1 as (distribute by p_mfgr sort by p_mfgr, p_name rows between 2 preceding and 2 following); + +-- 21. 
testDISTs +explain vectorization detail +select p_mfgr,p_name, p_size, +histogram_numeric(p_retailprice, 5) over w1 as hist, +percentile(p_partkey, 0.5) over w1 as per, +row_number() over(distribute by p_mfgr sort by p_mfgr, p_name) as rn +from part +window w1 as (distribute by p_mfgr sort by p_mfgr, p_name rows between 2 preceding and 2 following); +select p_mfgr,p_name, p_size, +histogram_numeric(p_retailprice, 5) over w1 as hist, +percentile(p_partkey, 0.5) over w1 as per, +row_number() over(distribute by p_mfgr sort by p_mfgr, p_name) as rn +from part +window w1 as (distribute by p_mfgr sort by p_mfgr, p_name rows between 2 preceding and 2 following); + +-- 22. testViewAsTableInputWithWindowing +explain vectorization detail +create view IF NOT EXISTS mfgr_price_view as +select p_mfgr, p_brand, +round(sum(p_retailprice),2) as s +from part +group by p_mfgr, p_brand; +create view IF NOT EXISTS mfgr_price_view as +select p_mfgr, p_brand, +round(sum(p_retailprice),2) as s +from part +group by p_mfgr, p_brand; + +explain vectorization detail +select * +from ( +select p_mfgr, p_brand, s, +round(sum(s) over w1 , 2) as s1 +from mfgr_price_view +window w1 as (distribute by p_mfgr sort by p_mfgr ) +) sq +order by p_mfgr, p_brand; +select * +from ( +select p_mfgr, p_brand, s, +round(sum(s) over w1 , 2) as s1 +from mfgr_price_view +window w1 as (distribute by p_mfgr sort by p_mfgr ) +) sq +order by p_mfgr, p_brand; + +select p_mfgr, p_brand, s, +round(sum(s) over w1 ,2) as s1 +from mfgr_price_view +window w1 as (distribute by p_mfgr sort by p_brand rows between 2 preceding and current row); + +-- 23. testCreateViewWithWindowingQuery +explain vectorization detail +create view IF NOT EXISTS mfgr_brand_price_view as +select p_mfgr, p_brand, +round(sum(p_retailprice) over w1,2) as s +from part +window w1 as (distribute by p_mfgr sort by p_name rows between 2 preceding and current row); +create view IF NOT EXISTS mfgr_brand_price_view as +select p_mfgr, p_brand, +round(sum(p_retailprice) over w1,2) as s +from part +window w1 as (distribute by p_mfgr sort by p_name rows between 2 preceding and current row); + +explain vectorization detail +select * from mfgr_brand_price_view; +select * from mfgr_brand_price_view; + +-- 24. testLateralViews +explain vectorization detail +select p_mfgr, p_name, +lv_col, p_size, sum(p_size) over w1 as s +from (select p_mfgr, p_name, p_size, array(1,2,3) arr from part) p +lateral view explode(arr) part_lv as lv_col +window w1 as (distribute by p_mfgr sort by p_size, lv_col rows between 2 preceding and current row); +select p_mfgr, p_name, +lv_col, p_size, sum(p_size) over w1 as s +from (select p_mfgr, p_name, p_size, array(1,2,3) arr from part) p +lateral view explode(arr) part_lv as lv_col +window w1 as (distribute by p_mfgr sort by p_size, lv_col rows between 2 preceding and current row); + +-- 25. 
testMultipleInserts3SWQs +CREATE TABLE part_1( +p_mfgr STRING, +p_name STRING, +p_size INT, +r INT, +dr INT, +s DOUBLE); + +CREATE TABLE part_2( +p_mfgr STRING, +p_name STRING, +p_size INT, +r INT, +dr INT, +cud INT, +s2 DOUBLE, +fv1 INT); + +CREATE TABLE part_3( +p_mfgr STRING, +p_name STRING, +p_size INT, +c INT, +ca INT, +fv INT); + +explain vectorization detail +from part +INSERT OVERWRITE TABLE part_1 +select p_mfgr, p_name, p_size, +rank() over(distribute by p_mfgr sort by p_name ) as r, +dense_rank() over(distribute by p_mfgr sort by p_name ) as dr, +round(sum(p_retailprice) over (distribute by p_mfgr sort by p_name rows between unbounded preceding and current row),2) as s +INSERT OVERWRITE TABLE part_2 +select p_mfgr,p_name, p_size, +rank() over(distribute by p_mfgr sort by p_name) as r, +dense_rank() over(distribute by p_mfgr sort by p_name) as dr, +cume_dist() over(distribute by p_mfgr sort by p_name) as cud, +round(sum(p_size) over (distribute by p_mfgr sort by p_size range between 5 preceding and current row),1) as s2, +first_value(p_size) over w1 as fv1 +window w1 as (distribute by p_mfgr sort by p_mfgr, p_name rows between 2 preceding and 2 following) +INSERT OVERWRITE TABLE part_3 +select p_mfgr,p_name, p_size, +count(*) over(distribute by p_mfgr sort by p_name) as c, +count(p_size) over(distribute by p_mfgr sort by p_name) as ca, +first_value(p_size) over w1 as fv +window w1 as (distribute by p_mfgr sort by p_mfgr, p_name rows between 2 preceding and 2 following); +from part +INSERT OVERWRITE TABLE part_1 +select p_mfgr, p_name, p_size, +rank() over(distribute by p_mfgr sort by p_name ) as r, +dense_rank() over(distribute by p_mfgr sort by p_name ) as dr, +round(sum(p_retailprice) over (distribute by p_mfgr sort by p_name rows between unbounded preceding and current row),2) as s +INSERT OVERWRITE TABLE part_2 +select p_mfgr,p_name, p_size, +rank() over(distribute by p_mfgr sort by p_name) as r, +dense_rank() over(distribute by p_mfgr sort by p_name) as dr, +cume_dist() over(distribute by p_mfgr sort by p_name) as cud, +round(sum(p_size) over (distribute by p_mfgr sort by p_size range between 5 preceding and current row),1) as s2, +first_value(p_size) over w1 as fv1 +window w1 as (distribute by p_mfgr sort by p_mfgr, p_name rows between 2 preceding and 2 following) +INSERT OVERWRITE TABLE part_3 +select p_mfgr,p_name, p_size, +count(*) over(distribute by p_mfgr sort by p_name) as c, +count(p_size) over(distribute by p_mfgr sort by p_name) as ca, +first_value(p_size) over w1 as fv +window w1 as (distribute by p_mfgr sort by p_mfgr, p_name rows between 2 preceding and 2 following); + +select * from part_1; + +select * from part_2; + +select * from part_3; + +-- 26. testGroupByHavingWithSWQAndAlias +explain vectorization detail +select p_mfgr, p_name, p_size, min(p_retailprice) as mi, +rank() over(distribute by p_mfgr sort by p_name) as r, +dense_rank() over(distribute by p_mfgr sort by p_name) as dr, +p_size, p_size - lag(p_size,1,p_size) over(distribute by p_mfgr sort by p_name) as deltaSz +from part +group by p_mfgr, p_name, p_size +having p_size > 0 +; +select p_mfgr, p_name, p_size, min(p_retailprice) as mi, +rank() over(distribute by p_mfgr sort by p_name) as r, +dense_rank() over(distribute by p_mfgr sort by p_name) as dr, +p_size, p_size - lag(p_size,1,p_size) over(distribute by p_mfgr sort by p_name) as deltaSz +from part +group by p_mfgr, p_name, p_size +having p_size > 0 +; + +-- 27. 
testMultipleRangeWindows +explain vectorization detail +select p_mfgr,p_name, p_size, +sum(p_size) over (distribute by p_mfgr sort by p_size range between 10 preceding and current row) as s2, +sum(p_size) over (distribute by p_mfgr sort by p_size range between current row and 10 following ) as s1 +from part +window w1 as (rows between 2 preceding and 2 following); +select p_mfgr,p_name, p_size, +sum(p_size) over (distribute by p_mfgr sort by p_size range between 10 preceding and current row) as s2, +sum(p_size) over (distribute by p_mfgr sort by p_size range between current row and 10 following ) as s1 +from part +window w1 as (rows between 2 preceding and 2 following); + +-- 28. testPartOrderInUDAFInvoke +explain vectorization detail +select p_mfgr, p_name, p_size, +sum(p_size) over (partition by p_mfgr order by p_name rows between 2 preceding and 2 following) as s +from part; +select p_mfgr, p_name, p_size, +sum(p_size) over (partition by p_mfgr order by p_name rows between 2 preceding and 2 following) as s +from part; + +-- 29. testPartOrderInWdwDef +explain vectorization detail +select p_mfgr, p_name, p_size, +sum(p_size) over w1 as s +from part +window w1 as (partition by p_mfgr order by p_name rows between 2 preceding and 2 following); +select p_mfgr, p_name, p_size, +sum(p_size) over w1 as s +from part +window w1 as (partition by p_mfgr order by p_name rows between 2 preceding and 2 following); + +-- 30. testDefaultPartitioningSpecRules +explain vectorization detail +select p_mfgr, p_name, p_size, +sum(p_size) over w1 as s, +sum(p_size) over w2 as s2 +from part +window w1 as (distribute by p_mfgr sort by p_name rows between 2 preceding and 2 following), + w2 as (partition by p_mfgr order by p_name); +select p_mfgr, p_name, p_size, +sum(p_size) over w1 as s, +sum(p_size) over w2 as s2 +from part +window w1 as (distribute by p_mfgr sort by p_name rows between 2 preceding and 2 following), + w2 as (partition by p_mfgr order by p_name); + +-- 31. testWindowCrossReference +explain vectorization detail +select p_mfgr, p_name, p_size, +sum(p_size) over w1 as s1, +sum(p_size) over w2 as s2 +from part +window w1 as (partition by p_mfgr order by p_name range between 2 preceding and 2 following), + w2 as w1; +select p_mfgr, p_name, p_size, +sum(p_size) over w1 as s1, +sum(p_size) over w2 as s2 +from part +window w1 as (partition by p_mfgr order by p_name range between 2 preceding and 2 following), + w2 as w1; + + +-- 32. testWindowInheritance +explain vectorization detail +select p_mfgr, p_name, p_size, +sum(p_size) over w1 as s1, +sum(p_size) over w2 as s2 +from part +window w1 as (partition by p_mfgr order by p_name range between 2 preceding and 2 following), + w2 as (w1 rows between unbounded preceding and current row); +select p_mfgr, p_name, p_size, +sum(p_size) over w1 as s1, +sum(p_size) over w2 as s2 +from part +window w1 as (partition by p_mfgr order by p_name range between 2 preceding and 2 following), + w2 as (w1 rows between unbounded preceding and current row); + + +-- 33. 
testWindowForwardReference +explain vectorization detail +select p_mfgr, p_name, p_size, +sum(p_size) over w1 as s1, +sum(p_size) over w2 as s2, +sum(p_size) over w3 as s3 +from part +window w1 as (distribute by p_mfgr sort by p_name range between 2 preceding and 2 following), + w2 as w3, + w3 as (distribute by p_mfgr sort by p_name range between unbounded preceding and current row); +select p_mfgr, p_name, p_size, +sum(p_size) over w1 as s1, +sum(p_size) over w2 as s2, +sum(p_size) over w3 as s3 +from part +window w1 as (distribute by p_mfgr sort by p_name range between 2 preceding and 2 following), + w2 as w3, + w3 as (distribute by p_mfgr sort by p_name range between unbounded preceding and current row); + + +-- 34. testWindowDefinitionPropagation +explain vectorization detail +select p_mfgr, p_name, p_size, +sum(p_size) over w1 as s1, +sum(p_size) over w2 as s2, +sum(p_size) over (w3 rows between 2 preceding and 2 following) as s3 +from part +window w1 as (distribute by p_mfgr sort by p_name range between 2 preceding and 2 following), + w2 as w3, + w3 as (distribute by p_mfgr sort by p_name range between unbounded preceding and current row); +select p_mfgr, p_name, p_size, +sum(p_size) over w1 as s1, +sum(p_size) over w2 as s2, +sum(p_size) over (w3 rows between 2 preceding and 2 following) as s3 +from part +window w1 as (distribute by p_mfgr sort by p_name range between 2 preceding and 2 following), + w2 as w3, + w3 as (distribute by p_mfgr sort by p_name range between unbounded preceding and current row); + +-- 35. testDistinctWithWindowing +explain vectorization detail +select DISTINCT p_mfgr, p_name, p_size, +sum(p_size) over w1 as s +from part +window w1 as (distribute by p_mfgr sort by p_name rows between 2 preceding and 2 following); + +select DISTINCT p_mfgr, p_name, p_size, +sum(p_size) over w1 as s +from part +window w1 as (distribute by p_mfgr sort by p_name rows between 2 preceding and 2 following); + +-- 36. testRankWithPartitioning +explain vectorization detail +select p_mfgr, p_name, p_size, +rank() over (partition by p_mfgr order by p_name ) as r +from part; +select p_mfgr, p_name, p_size, +rank() over (partition by p_mfgr order by p_name ) as r +from part; + +-- 37. testPartitioningVariousForms +explain vectorization detail +select p_mfgr, +round(sum(p_retailprice) over (partition by p_mfgr order by p_mfgr),2) as s1, +min(p_retailprice) over (partition by p_mfgr) as s2, +max(p_retailprice) over (distribute by p_mfgr sort by p_mfgr) as s3, +round(avg(p_retailprice) over (distribute by p_mfgr),2) as s4, +count(p_retailprice) over (cluster by p_mfgr ) as s5 +from part; +select p_mfgr, +round(sum(p_retailprice) over (partition by p_mfgr order by p_mfgr),2) as s1, +min(p_retailprice) over (partition by p_mfgr) as s2, +max(p_retailprice) over (distribute by p_mfgr sort by p_mfgr) as s3, +round(avg(p_retailprice) over (distribute by p_mfgr),2) as s4, +count(p_retailprice) over (cluster by p_mfgr ) as s5 +from part; + +-- 38. 
+-- 38. testPartitioningVariousForms2
+explain vectorization detail
+select p_mfgr, p_name, p_size,
+round(sum(p_retailprice) over (partition by p_mfgr, p_name order by p_mfgr, p_name rows between unbounded preceding and current row),2) as s1,
+min(p_retailprice) over (distribute by p_mfgr, p_name sort by p_mfgr, p_name rows between unbounded preceding and current row) as s2,
+max(p_retailprice) over (partition by p_mfgr, p_name order by p_name) as s3
+from part;
+select p_mfgr, p_name, p_size,
+round(sum(p_retailprice) over (partition by p_mfgr, p_name order by p_mfgr, p_name rows between unbounded preceding and current row),2) as s1,
+min(p_retailprice) over (distribute by p_mfgr, p_name sort by p_mfgr, p_name rows between unbounded preceding and current row) as s2,
+max(p_retailprice) over (partition by p_mfgr, p_name order by p_name) as s3
+from part;
+
+-- 39. testUDFOnOrderCols
+explain vectorization detail
+select p_mfgr, p_type, substr(p_type, 2) as short_ptype,
+rank() over (partition by p_mfgr order by substr(p_type, 2)) as r
+from part;
+select p_mfgr, p_type, substr(p_type, 2) as short_ptype,
+rank() over (partition by p_mfgr order by substr(p_type, 2)) as r
+from part;
+
+-- 40. testNoBetweenForRows
+explain vectorization detail
+select p_mfgr, p_name, p_size,
+    round(sum(p_retailprice) over (distribute by p_mfgr sort by p_name rows unbounded preceding),2) as s1
+    from part ;
+select p_mfgr, p_name, p_size,
+    round(sum(p_retailprice) over (distribute by p_mfgr sort by p_name rows unbounded preceding),2) as s1
+    from part ;
+
+-- 41. testNoBetweenForRange
+explain vectorization detail
+select p_mfgr, p_name, p_size,
+    round(sum(p_retailprice) over (distribute by p_mfgr sort by p_size range unbounded preceding),2) as s1
+    from part ;
+
+select p_mfgr, p_name, p_size,
+    round(sum(p_retailprice) over (distribute by p_mfgr sort by p_size range unbounded preceding),2) as s1
+    from part ;
+
+-- 42. testUnboundedFollowingForRows
+explain vectorization detail
+select p_mfgr, p_name, p_size,
+    round(sum(p_retailprice) over (distribute by p_mfgr sort by p_name rows between current row and unbounded following),2) as s1
+    from part ;
+select p_mfgr, p_name, p_size,
+    round(sum(p_retailprice) over (distribute by p_mfgr sort by p_name rows between current row and unbounded following),2) as s1
+    from part ;
+
+-- 43. testUnboundedFollowingForRange
+explain vectorization detail
+select p_mfgr, p_name, p_size,
+    round(sum(p_retailprice) over (distribute by p_mfgr sort by p_size range between current row and unbounded following),2) as s1
+    from part ;
+select p_mfgr, p_name, p_size,
+    round(sum(p_retailprice) over (distribute by p_mfgr sort by p_size range between current row and unbounded following),2) as s1
+    from part ;
+
+-- 44. testOverNoPartitionSingleAggregate
+explain vectorization detail
+select p_name, p_retailprice,
+round(avg(p_retailprice) over(),2)
+from part
+order by p_name;
+select p_name, p_retailprice,
+round(avg(p_retailprice) over(),2)
+from part
+order by p_name;
+
+-- 45. empty partition test
+explain vectorization detail
+select p_mfgr,
+  sum(p_size) over (partition by p_mfgr order by p_size rows between unbounded preceding and current row)
+from part
+where p_mfgr = 'Manufacturer#6'
+;
+select p_mfgr,
+  sum(p_size) over (partition by p_mfgr order by p_size rows between unbounded preceding and current row)
+from part
+where p_mfgr = 'Manufacturer#6'
+;
+
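+-- Note on tests 45 and 47: the predicates ('Manufacturer#6', 'm1') appear to match no
+-- rows of part, so the windowing operators here run over an empty input rather than an
+-- empty frame inside a partition.
+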
+-- 46. window size is the same as the partition size
+explain vectorization detail
+select p_retailprice, round(avg(p_retailprice) over (partition by p_mfgr order by p_name rows between current row and 6 following),2),
+round(sum(p_retailprice) over (partition by p_mfgr order by p_name rows between current row and 6 following),2)
+from part
+where p_mfgr='Manufacturer#1';
+select p_retailprice, round(avg(p_retailprice) over (partition by p_mfgr order by p_name rows between current row and 6 following),2),
+round(sum(p_retailprice) over (partition by p_mfgr order by p_name rows between current row and 6 following),2)
+from part
+where p_mfgr='Manufacturer#1';
+
+-- 47. empty partition
+explain vectorization detail
+select sum(p_size) over (partition by p_mfgr )
+from part where p_mfgr = 'm1';
+select sum(p_size) over (partition by p_mfgr )
+from part where p_mfgr = 'm1';
diff --git ql/src/test/queries/clientpositive/vector_windowing_expressions.q ql/src/test/queries/clientpositive/vector_windowing_expressions.q
new file mode 100644
index 0000000..6654112
--- /dev/null
+++ ql/src/test/queries/clientpositive/vector_windowing_expressions.q
@@ -0,0 +1,92 @@
+set hive.cli.print.header=true;
+SET hive.vectorized.execution.enabled=true;
+set hive.fetch.task.conversion=none;
+
+drop table over10k;
+
+create table over10k(
+       t tinyint,
+       si smallint,
+       i int,
+       b bigint,
+       f float,
+       d double,
+       bo boolean,
+       s string,
+       ts timestamp,
+       `dec` decimal(4,2),
+       bin binary)
+   row format delimited
+   fields terminated by '|';
+
+load data local inpath '../../data/files/over10k' into table over10k;
+
+explain vectorization detail
+select p_mfgr, p_retailprice, p_size,
+round(sum(p_retailprice) over w1 , 2) = round(sum(lag(p_retailprice,1,0.0)) over w1 + last_value(p_retailprice) over w1 , 2),
+max(p_retailprice) over w1 - min(p_retailprice) over w1 = last_value(p_retailprice) over w1 - first_value(p_retailprice) over w1
+from part
+window w1 as (distribute by p_mfgr sort by p_retailprice)
+;
+select p_mfgr, p_retailprice, p_size,
+round(sum(p_retailprice) over w1 , 2) = round(sum(lag(p_retailprice,1,0.0)) over w1 + last_value(p_retailprice) over w1 , 2),
+max(p_retailprice) over w1 - min(p_retailprice) over w1 = last_value(p_retailprice) over w1 - first_value(p_retailprice) over w1
+from part
+window w1 as (distribute by p_mfgr sort by p_retailprice)
+;
+
+explain vectorization detail
+select p_mfgr, p_retailprice, p_size,
+rank() over (distribute by p_mfgr sort by p_retailprice) as r,
+sum(p_retailprice) over (distribute by p_mfgr sort by p_retailprice rows between unbounded preceding and current row) as s2,
+sum(p_retailprice) over (distribute by p_mfgr sort by p_retailprice rows between unbounded preceding and current row) -5 as s1
+from part
+;
+select p_mfgr, p_retailprice, p_size,
+rank() over (distribute by p_mfgr sort by p_retailprice) as r,
+sum(p_retailprice) over (distribute by p_mfgr sort by p_retailprice rows between unbounded preceding and current row) as s2,
+sum(p_retailprice) over (distribute by p_mfgr sort by p_retailprice rows between unbounded preceding and current row) -5 as s1
+from part
+;
+
+explain vectorization detail
+select s, si, f, si - lead(f, 3) over (partition by t order by bo,s,si,f desc) from over10k limit 100;
+select s, si, f, si - lead(f, 3) over (partition by t order by bo,s,si,f desc) from over10k limit 100;
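+-- Note: lead(c, n)/lag(c, n) return NULL once the offset runs past the partition edge;
+-- the three-argument forms below, e.g. lead(i, 3, 0) and lag(s, 3, 'fred'), substitute
+-- the supplied default instead.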
+explain vectorization detail
+select s, i, i - lead(i, 3, 0) over (partition by si order by i,s) from over10k limit 100;
+select s, i, i - lead(i, 3, 0) over (partition by si order by i,s) from over10k limit 100;
+explain vectorization detail
+select s, si, d, si - lag(d, 3) over (partition by b order by si,s,d) from over10k limit 100;
+select s, si, d, si - lag(d, 3) over (partition by b order by si,s,d) from over10k limit 100;
+explain vectorization detail
+select s, lag(s, 3, 'fred') over (partition by f order by b) from over10k limit 100;
+select s, lag(s, 3, 'fred') over (partition by f order by b) from over10k limit 100;
+
+explain vectorization detail
+select p_mfgr, avg(p_retailprice) over(partition by p_mfgr, p_type order by p_mfgr) from part;
+select p_mfgr, avg(p_retailprice) over(partition by p_mfgr, p_type order by p_mfgr) from part;
+
+explain vectorization detail
+select p_mfgr, avg(p_retailprice) over(partition by p_mfgr order by p_type,p_mfgr rows between unbounded preceding and current row) from part;
+select p_mfgr, avg(p_retailprice) over(partition by p_mfgr order by p_type,p_mfgr rows between unbounded preceding and current row) from part;
+
+-- multi table insert test
+create table t1 (a1 int, b1 string);
+create table t2 (a1 int, b1 string);
+explain vectorization detail
+from (select sum(i) over (partition by ts order by i), s from over10k) tt insert overwrite table t1 select * insert overwrite table t2 select * ;
+from (select sum(i) over (partition by ts order by i), s from over10k) tt insert overwrite table t1 select * insert overwrite table t2 select * ;
+select * from t1 limit 3;
+select * from t2 limit 3;
+
+explain vectorization detail
+select p_mfgr, p_retailprice, p_size,
+round(sum(p_retailprice) over w1 , 2) + 50.0 = round(sum(lag(p_retailprice,1,50.0)) over w1 + (last_value(p_retailprice) over w1),2)
+from part
+window w1 as (distribute by p_mfgr sort by p_retailprice)
+limit 11;
+select p_mfgr, p_retailprice, p_size,
+round(sum(p_retailprice) over w1 , 2) + 50.0 = round(sum(lag(p_retailprice,1,50.0)) over w1 + (last_value(p_retailprice) over w1),2)
+from part
+window w1 as (distribute by p_mfgr sort by p_retailprice)
+limit 11;
diff --git ql/src/test/queries/clientpositive/vector_windowing_gby.q ql/src/test/queries/clientpositive/vector_windowing_gby.q
new file mode 100644
index 0000000..7fd18aa
--- /dev/null
+++ ql/src/test/queries/clientpositive/vector_windowing_gby.q
@@ -0,0 +1,19 @@
+set hive.explain.user=false;
+set hive.cli.print.header=true;
+SET hive.vectorized.execution.enabled=true;
+set hive.fetch.task.conversion=none;
+
+set hive.mapred.mode=nonstrict;
+
+explain vectorization detail
+  select rank() over (order by return_ratio) as return_rank from
+  (select sum(wr.cint)/sum(ws.c_int) as return_ratio
+      from cbo_t3 ws join alltypesorc wr on ws.value = wr.cstring1
+      group by ws.c_boolean ) in_web
+;
+
+  select rank() over (order by return_ratio) as return_rank from
+  (select sum(wr.cint)/sum(ws.c_int) as return_ratio
+      from cbo_t3 ws join alltypesorc wr on ws.value = wr.cstring1
+      group by ws.c_boolean ) in_web
+;
diff --git ql/src/test/queries/clientpositive/vector_windowing_gby2.q ql/src/test/queries/clientpositive/vector_windowing_gby2.q
new file mode 100644
index 0000000..8025b5d
--- /dev/null
+++ ql/src/test/queries/clientpositive/vector_windowing_gby2.q
@@ -0,0 +1,46 @@
+set hive.explain.user=false;
+set hive.cli.print.header=true;
+SET hive.vectorized.execution.enabled=true;
+set hive.fetch.task.conversion=none;
+
+set hive.mapred.mode=nonstrict;
+
+explain vectorization detail
+select rank() over (order by sum(ws.c_int)) as return_rank
+from cbo_t3 ws
+group by ws.key;
+
+select rank() over (order by sum(ws.c_int)) as return_rank
+from cbo_t3 ws
+group by ws.key;
+
+explain vectorization detail
+select avg(cast(ws.key as int)) over (partition by min(ws.value) order by sum(ws.c_int)) as return_rank
+from cbo_t3 ws
+group by cast(ws.key as int);
+
+select avg(cast(ws.key as int)) over (partition by min(ws.value) order by sum(ws.c_int)) as return_rank
+from cbo_t3 ws
+group by cast(ws.key as int);
+
+explain vectorization detail
+select rank () over(partition by key order by sum(c_int - c_float) desc) ,
+dense_rank () over(partition by lower(value) order by sum(c_float/c_int) asc),
+percent_rank () over(partition by max(c_int) order by sum((c_float/c_int) - c_int) asc)
+from cbo_t3
+group by key, value;
+
+select rank () over(partition by key order by sum(c_int - c_float) desc) ,
+dense_rank () over(partition by lower(value) order by sum(c_float/c_int) asc),
+percent_rank () over(partition by max(c_int) order by sum((c_float/c_int) - c_int) asc)
+from cbo_t3
+group by key, value;
+
+explain vectorization detail
+select rank() over (order by sum(wr.cint)/sum(ws.c_int)) as return_rank
+from cbo_t3 ws join alltypesorc wr on ws.value = wr.cstring1
+group by ws.c_boolean;
+
+select rank() over (order by sum(wr.cint)/sum(ws.c_int)) as return_rank
+from cbo_t3 ws join alltypesorc wr on ws.value = wr.cstring1
+group by ws.c_boolean;
diff --git ql/src/test/queries/clientpositive/vector_windowing_multipartitioning.q ql/src/test/queries/clientpositive/vector_windowing_multipartitioning.q
new file mode 100644
index 0000000..1cefd78
--- /dev/null
+++ ql/src/test/queries/clientpositive/vector_windowing_multipartitioning.q
@@ -0,0 +1,71 @@
+set hive.explain.user=false;
+set hive.cli.print.header=true;
+SET hive.vectorized.execution.enabled=true;
+set hive.fetch.task.conversion=none;
+
+drop table over10k;
+
+create table over10k(
+       t tinyint,
+       si smallint,
+       i int,
+       b bigint,
+       f float,
+       d double,
+       bo boolean,
+       s string,
+       ts timestamp,
+       `dec` decimal(4,2),
+       bin binary)
+   row format delimited
+   fields terminated by '|';
+
+load data local inpath '../../data/files/over10k' into table over10k;
+
+explain vectorization detail
+select s, rank() over (partition by s order by si), sum(b) over (partition by s order by si) from over10k limit 100;
+select s, rank() over (partition by s order by si), sum(b) over (partition by s order by si) from over10k limit 100;
+
+explain vectorization detail
+select s,
+rank() over (partition by s order by `dec` desc),
+sum(b) over (partition by s order by ts desc)
+from over10k
+where s = 'tom allen' or s = 'bob steinbeck';
+select s,
+rank() over (partition by s order by `dec` desc),
+sum(b) over (partition by s order by ts desc)
+from over10k
+where s = 'tom allen' or s = 'bob steinbeck';
+
+explain vectorization detail
+select s, sum(i) over (partition by s), sum(f) over (partition by si) from over10k where s = 'tom allen' or s = 'bob steinbeck' ;
+select s, sum(i) over (partition by s), sum(f) over (partition by si) from over10k where s = 'tom allen' or s = 'bob steinbeck' ;
+
+explain vectorization detail
+select s, rank() over (partition by s order by bo), rank() over (partition by si order by bin desc) from over10k
+where s = 'tom allen' or s = 'bob steinbeck';
+select s, rank() over (partition by s order by bo), rank() over (partition by si order by bin desc) from over10k
+where s = 'tom allen' or s = 'bob steinbeck';
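+
+-- Note: each distinct partition/order combination used in these queries generally maps
+-- to its own PTF stage (and shuffle) in the plan, which is what this multi-partitioning
+-- test is meant to exercise.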
+
+explain vectorization detail
+select s, sum(f) over (partition by i), row_number() over (order by f) from over10k where s = 'tom allen' or s = 'bob steinbeck';
+select s, sum(f) over (partition by i), row_number() over (order by f) from over10k where s = 'tom allen' or s = 'bob steinbeck';
+
+explain vectorization detail
+select s, rank() over w1,
+rank() over w2
+from over10k
+where s = 'tom allen' or s = 'bob steinbeck'
+window
+w1 as (partition by s order by `dec`),
+w2 as (partition by si order by f)
+;
+select s, rank() over w1,
+rank() over w2
+from over10k
+where s = 'tom allen' or s = 'bob steinbeck'
+window
+w1 as (partition by s order by `dec`),
+w2 as (partition by si order by f)
+;
diff --git ql/src/test/queries/clientpositive/vector_windowing_order_null.q ql/src/test/queries/clientpositive/vector_windowing_order_null.q
new file mode 100644
index 0000000..5098a55
--- /dev/null
+++ ql/src/test/queries/clientpositive/vector_windowing_order_null.q
@@ -0,0 +1,56 @@
+set hive.explain.user=false;
+set hive.cli.print.header=true;
+SET hive.vectorized.execution.enabled=true;
+set hive.fetch.task.conversion=none;
+
+drop table over10k;
+
+create table over10k(
+       t tinyint,
+       si smallint,
+       i int,
+       b bigint,
+       f float,
+       d double,
+       bo boolean,
+       s string,
+       ts timestamp,
+       `dec` decimal,
+       bin binary)
+   row format delimited
+   fields terminated by '|';
+
+load data local inpath '../../data/files/over10k' into table over10k;
+load data local inpath '../../data/files/over4_null' into table over10k;
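+
+-- Note: over4_null presumably contributes rows with NULLs in the ordering columns, so
+-- the nulls first/nulls last variants below actually order real NULL keys.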
+
+explain vectorization detail
+select i, s, b, sum(b) over (partition by i order by s nulls last,b rows unbounded preceding) from over10k limit 10;
+select i, s, b, sum(b) over (partition by i order by s nulls last,b rows unbounded preceding) from over10k limit 10;
+
+explain vectorization detail
+select d, s, f, sum(f) over (partition by d order by s,f desc nulls first rows unbounded preceding) from over10k limit 10;
+select d, s, f, sum(f) over (partition by d order by s,f desc nulls first rows unbounded preceding) from over10k limit 10;
+
+explain vectorization detail
+select ts, s, f, sum(f) over (partition by ts order by f asc nulls first range between current row and unbounded following) from over10k limit 10;
+select ts, s, f, sum(f) over (partition by ts order by f asc nulls first range between current row and unbounded following) from over10k limit 10;
+
+explain vectorization detail
+select t, s, d, avg(d) over (partition by t order by s,d desc nulls first rows between 5 preceding and 5 following) from over10k limit 10;
+select t, s, d, avg(d) over (partition by t order by s,d desc nulls first rows between 5 preceding and 5 following) from over10k limit 10;
+
+explain vectorization detail
+select ts, s, sum(i) over(partition by ts order by s nulls last) from over10k limit 10 offset 3;
+select ts, s, sum(i) over(partition by ts order by s nulls last) from over10k limit 10 offset 3;
+
+explain vectorization detail
+select s, i, round(sum(d) over (partition by s order by i desc nulls last) , 3) from over10k limit 5;
+select s, i, round(sum(d) over (partition by s order by i desc nulls last) , 3) from over10k limit 5;
+
+explain vectorization detail
+select s, i, round(avg(d) over (partition by s order by i desc nulls last) / 10.0 , 3) from over10k limit 5;
+select s, i, round(avg(d) over (partition by s order by i desc nulls last) / 10.0 , 3) from over10k limit 5;
+
+explain vectorization detail
+select s, i, round((avg(d) over w1 + 10.0) - (avg(d) over w1 - 10.0),3) from over10k window w1 as (partition by s order by i nulls last) limit 5;
+select s, i, round((avg(d) over w1 + 10.0) - (avg(d) over w1 - 10.0),3) from over10k window w1 as (partition by s order by i nulls last) limit 5;
diff --git ql/src/test/queries/clientpositive/vector_windowing_range_multiorder.q ql/src/test/queries/clientpositive/vector_windowing_range_multiorder.q
new file mode 100644
index 0000000..6206b22
--- /dev/null
+++ ql/src/test/queries/clientpositive/vector_windowing_range_multiorder.q
@@ -0,0 +1,66 @@
+set hive.cli.print.header=true;
+SET hive.vectorized.execution.enabled=true;
+set hive.fetch.task.conversion=none;
+
+drop table over10k;
+
+create table over10k(
+       t tinyint,
+       si smallint,
+       i int,
+       b bigint,
+       f float,
+       d double,
+       bo boolean,
+       s string,
+       ts timestamp,
+       `dec` decimal(4,2),
+       bin binary)
+   row format delimited
+   fields terminated by '|';
+
+load data local inpath '../../data/files/over10k' into table over10k;
+
+explain vectorization detail
+select first_value(t) over ( partition by si order by i, b ) from over10k limit 100;
+select first_value(t) over ( partition by si order by i, b ) from over10k limit 100;
+
+explain vectorization detail
+select last_value(i) over (partition by si, bo order by i, f desc range current row) from over10k limit 100;
+select last_value(i) over (partition by si, bo order by i, f desc range current row) from over10k limit 100;
+
+explain vectorization detail
+select row_number() over (partition by si, bo order by i, f desc range between unbounded preceding and unbounded following) from over10k limit 100;
+select row_number() over (partition by si, bo order by i, f desc range between unbounded preceding and unbounded following) from over10k limit 100;
+
+explain vectorization detail
+select s, si, i, avg(i) over (partition by s range between unbounded preceding and current row) from over10k limit 100;
+select s, si, i, avg(i) over (partition by s range between unbounded preceding and current row) from over10k limit 100;
+
+explain vectorization detail
+select s, si, i, avg(i) over (partition by s order by si, i range between unbounded preceding and current row) from over10k limit 100;
+select s, si, i, avg(i) over (partition by s order by si, i range between unbounded preceding and current row) from over10k limit 100;
+
+explain vectorization detail
+select s, si, i, min(i) over (partition by s order by si, i range between unbounded preceding and current row) from over10k limit 100;
+select s, si, i, min(i) over (partition by s order by si, i range between unbounded preceding and current row) from over10k limit 100;
+
+explain vectorization detail
+select s, si, i, avg(i) over (partition by s order by si, i desc range between unbounded preceding and current row) from over10k limit 100;
+select s, si, i, avg(i) over (partition by s order by si, i desc range between unbounded preceding and current row) from over10k limit 100;
+
+explain vectorization detail
+select si, bo, i, f, max(i) over (partition by si, bo order by i, f desc range between unbounded preceding and current row) from over10k limit 100;
+select si, bo, i, f, max(i) over (partition by si, bo order by i, f desc range between unbounded preceding and current row) from over10k limit 100;
+
+explain vectorization detail
+select bo, rank() over (partition by i order by bo nulls first, b nulls last range between unbounded preceding and unbounded following) from over10k limit 100;
+select bo, rank() over (partition by i order by bo nulls first, b nulls last range between unbounded preceding and unbounded following) from over10k limit 100;
+
+explain vectorization detail
+select CAST(s as CHAR(12)), rank() over (partition by i order by CAST(s as CHAR(12)) nulls last range between unbounded preceding and unbounded following) from over10k limit 100;
+select CAST(s as CHAR(12)), rank() over (partition by i order by CAST(s as CHAR(12)) nulls last range between unbounded preceding and unbounded following) from over10k limit 100;
+
+explain vectorization detail
+select CAST(s as VARCHAR(12)), rank() over (partition by i order by CAST(s as VARCHAR(12)) nulls last range between unbounded preceding and unbounded following) from over10k limit 100;
+select CAST(s as VARCHAR(12)), rank() over (partition by i order by CAST(s as VARCHAR(12)) nulls last range between unbounded preceding and unbounded following) from over10k limit 100;
diff --git ql/src/test/queries/clientpositive/vector_windowing_rank.q ql/src/test/queries/clientpositive/vector_windowing_rank.q
new file mode 100644
index 0000000..324e3b6
--- /dev/null
+++ ql/src/test/queries/clientpositive/vector_windowing_rank.q
@@ -0,0 +1,115 @@
+set hive.cli.print.header=true;
+SET hive.vectorized.execution.enabled=true;
+set hive.fetch.task.conversion=none;
+
+drop table over10k;
+
+create table over10k(
+       t tinyint,
+       si smallint,
+       i int,
+       b bigint,
+       f float,
+       d double,
+       bo boolean,
+       s string,
+       ts timestamp,
+       `dec` decimal(4,2),
+       bin binary)
+   row format delimited
+   fields terminated by '|';
+
+load data local inpath '../../data/files/over10k' into table over10k;
+
+explain vectorization detail
+select s, rank() over (partition by f order by t) from over10k limit 100;
+select s, rank() over (partition by f order by t) from over10k limit 100;
+
+explain vectorization detail
+select s, dense_rank() over (partition by ts order by i,s desc) from over10k limit 100;
+select s, dense_rank() over (partition by ts order by i,s desc) from over10k limit 100;
+
+explain vectorization detail
+select s, cume_dist() over (partition by bo order by b,s) from over10k limit 100;
+select s, cume_dist() over (partition by bo order by b,s) from over10k limit 100;
+
+explain vectorization detail
+select s, percent_rank() over (partition by `dec` order by f) from over10k limit 100;
+select s, percent_rank() over (partition by `dec` order by f) from over10k limit 100;
+
+-- If following tests fail, look for the comments in class PTFPPD::process()
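+-- (The ranked subqueries below filter on the rank outside the window function; how much
+-- of such a predicate may be pushed through the PTF is decided around PTFPPD::process(),
+-- which these cases pin down.)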
+
+explain vectorization detail
+select ts, `dec`, rnk
+from
+  (select ts, `dec`,
+  rank() over (partition by ts order by `dec`) as rnk
+  from
+  (select other.ts, other.`dec`
+   from over10k other
+   join over10k on (other.b = over10k.b)
+  ) joined
+ ) ranked
+where rnk = 1 limit 10;
+select ts, `dec`, rnk
+from
+  (select ts, `dec`,
+  rank() over (partition by ts order by `dec`) as rnk
+  from
+  (select other.ts, other.`dec`
+   from over10k other
+   join over10k on (other.b = over10k.b)
+  ) joined
+ ) ranked
+where rnk = 1 limit 10;
+
+explain vectorization detail
+select ts, `dec`, rnk
+from
+  (select ts, `dec`,
+  rank() over (partition by ts) as rnk
+  from
+  (select other.ts, other.`dec`
+   from over10k other
+   join over10k on (other.b = over10k.b)
+  ) joined
+ ) ranked
+where `dec` = 89.5 limit 10;
+select ts, `dec`, rnk
+from
+  (select ts, `dec`,
+  rank() over (partition by ts) as rnk
+  from
+  (select other.ts, other.`dec`
+   from over10k other
+   join over10k on (other.b = over10k.b)
+  ) joined
+ ) ranked
+where `dec` = 89.5 limit 10;
+
+explain vectorization detail
+select ts, `dec`, rnk
+from
+  (select ts, `dec`,
+  rank() over (partition by ts order by `dec`) as rnk
+  from
+  (select other.ts, other.`dec`
+   from over10k other
+   join over10k on (other.b = over10k.b)
+   where other.t < 10
+  ) joined
+ ) ranked
+where rnk = 1 limit 10;
+select ts, `dec`, rnk
+from
+  (select ts, `dec`,
+  rank() over (partition by ts order by `dec`) as rnk
+  from
+  (select other.ts, other.`dec`
+   from over10k other
+   join over10k on (other.b = over10k.b)
+   where other.t < 10
+  ) joined
+ ) ranked
+where rnk = 1 limit 10;
+
diff --git ql/src/test/queries/clientpositive/vector_windowing_streaming.q ql/src/test/queries/clientpositive/vector_windowing_streaming.q
new file mode 100644
index 0000000..1601eec
--- /dev/null
+++ ql/src/test/queries/clientpositive/vector_windowing_streaming.q
@@ -0,0 +1,83 @@
+set hive.cli.print.header=true;
+SET hive.vectorized.execution.enabled=true;
+set hive.fetch.task.conversion=none;
+
+drop table over10k;
+
+create table over10k(
+       t tinyint,
+       si smallint,
+       i int,
+       b bigint,
+       f float,
+       d double,
+       bo boolean,
+       s string,
+       ts timestamp,
+       `dec` decimal(4,2),
+       bin binary)
+   row format delimited
+   fields terminated by '|';
+
+load data local inpath '../../data/files/over10k' into table over10k;
+
+set hive.limit.pushdown.memory.usage=.8;
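+
+-- hive.limit.pushdown.memory.usage caps the fraction of reducer memory available to the
+-- top-N (limit pushdown) optimization; the rank()-with-"r < N" patterns below are the
+-- shape that optimization targets.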
+
+-- part tests
+explain vectorization detail
+select *
+from ( select p_mfgr, rank() over(partition by p_mfgr order by p_name) r from part) a
+;
+
+explain vectorization detail
+select *
+from ( select p_mfgr, rank() over(partition by p_mfgr order by p_name) r from part) a
+where r < 4;
+
+select *
+from ( select p_mfgr, rank() over(partition by p_mfgr order by p_name) r from part) a
+where r < 4;
+
+select *
+from ( select p_mfgr, rank() over(partition by p_mfgr order by p_name) r from part) a
+where r < 2;
+
+-- over10k tests
+explain vectorization detail
+select *
+from (select t, f, rank() over(partition by t order by f) r from over10k) a
+where r < 6 and t < 5;
+
+select *
+from (select t, f, rank() over(partition by t order by f) r from over10k) a
+where r < 6 and t < 5;
+
+select *
+from (select t, f, row_number() over(partition by t order by f) r from over10k) a
+where r < 8 and t < 0;
+
+set hive.vectorized.execution.enabled=false;
+set hive.limit.pushdown.memory.usage=0.8;
+
+explain vectorization detail
+select * from (select ctinyint, cdouble, rank() over(partition by ctinyint order by cdouble) r from alltypesorc) a where r < 5;
+
+drop table if exists sB;
+create table sB ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TEXTFILE as
+select * from (select ctinyint, cdouble, rank() over(partition by ctinyint order by cdouble) r from alltypesorc) a where r < 5;
+
+select * from sB
+where ctinyint is null;
+
+set hive.vectorized.execution.enabled=true;
+set hive.limit.pushdown.memory.usage=0.8;
+drop table if exists sD;
+
+explain vectorization detail
+create table sD ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TEXTFILE as
+select * from (select ctinyint, cdouble, rank() over(partition by ctinyint order by cdouble) r from alltypesorc) a where r < 5;
+create table sD ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TEXTFILE as
+select * from (select ctinyint, cdouble, rank() over(partition by ctinyint order by cdouble) r from alltypesorc) a where r < 5;
+
+select * from sD
+where ctinyint is null;
diff --git ql/src/test/queries/clientpositive/vector_windowing_windowspec.q ql/src/test/queries/clientpositive/vector_windowing_windowspec.q
new file mode 100644
index 0000000..6fed729
--- /dev/null
+++ ql/src/test/queries/clientpositive/vector_windowing_windowspec.q
@@ -0,0 +1,68 @@
+set hive.cli.print.header=true;
+SET hive.vectorized.execution.enabled=true;
+set hive.fetch.task.conversion=none;
+
+drop table over10k;
+
+create table over10k(
+       t tinyint,
+       si smallint,
+       i int,
+       b bigint,
+       f float,
+       d double,
+       bo boolean,
+       s string,
+       ts timestamp,
+       `dec` decimal,
+       bin binary)
+   row format delimited
+   fields terminated by '|';
+
+load data local inpath '../../data/files/over10k' into table over10k;
+
+explain vectorization detail
+select s, sum(b) over (partition by i order by s,b rows unbounded preceding) from over10k limit 100;
+select s, sum(b) over (partition by i order by s,b rows unbounded preceding) from over10k limit 100;
+
+explain vectorization detail
+select s, sum(f) over (partition by d order by s,f rows unbounded preceding) from over10k limit 100;
+select s, sum(f) over (partition by d order by s,f rows unbounded preceding) from over10k limit 100;
+
+explain vectorization detail
+select s, sum(f) over (partition by ts order by f range between current row and unbounded following) from over10k limit 100;
+select s, sum(f) over (partition by ts order by f range between current row and unbounded following) from over10k limit 100;
+
+explain vectorization detail
+select s, avg(f) over (partition by ts order by s,f rows between current row and 5 following) from over10k limit 100;
+select s, avg(f) over (partition by ts order by s,f rows between current row and 5 following) from over10k limit 100;
+
+explain vectorization detail
+select s, avg(d) over (partition by t order by s,d desc rows between 5 preceding and 5 following) from over10k limit 100;
+select s, avg(d) over (partition by t order by s,d desc rows between 5 preceding and 5 following) from over10k limit 100;
+
+explain vectorization detail
+select s, sum(i) over(partition by ts order by s) from over10k limit 100;
+select s, sum(i) over(partition by ts order by s) from over10k limit 100;
+
+explain vectorization detail
+select f, sum(f) over (partition by ts order by f range between unbounded preceding and current row) from over10k limit 100;
+select f, sum(f) over (partition by ts order by f range between unbounded preceding and current row) from over10k limit 100;
+
+explain vectorization detail
+select f, sum(f) over (partition by ts order by f rows between 2 preceding and 1 preceding) from over10k limit 100;
+select f, sum(f) over (partition by ts order by f rows between 2 preceding and 1 preceding) from over10k limit 100;
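+
+-- (In "rows between 2 preceding and 1 preceding" above, both frame bounds lie before the
+-- current row: each output sums at most the two rows immediately preceding it and never
+-- includes the current row itself.)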
+
+explain vectorization detail
+select s, i, round(avg(d) over (partition by s order by i) / 10.0 , 2) from over10k limit 7;
+select s, i, round(avg(d) over (partition by s order by i) / 10.0 , 2) from over10k limit 7;
+
+explain vectorization detail
+select s, i, round((avg(d) over w1 + 10.0) - (avg(d) over w1 - 10.0),2) from over10k window w1 as (partition by s order by i) limit 7;
+select s, i, round((avg(d) over w1 + 10.0) - (avg(d) over w1 - 10.0),2) from over10k window w1 as (partition by s order by i) limit 7;
+
+set hive.cbo.enable=false;
+-- HIVE-9228
+explain vectorization detail
+select s, i from ( select s, i, round((avg(d) over w1 + 10.0) - (avg(d) over w1 - 10.0),2) from over10k window w1 as (partition by s order by i)) X limit 7;
+select s, i from ( select s, i, round((avg(d) over w1 + 10.0) - (avg(d) over w1 - 10.0),2) from over10k window w1 as (partition by s order by i)) X limit 7;
diff --git ql/src/test/queries/clientpositive/vector_windowing_windowspec4.q ql/src/test/queries/clientpositive/vector_windowing_windowspec4.q
new file mode 100644
index 0000000..7d94195
--- /dev/null
+++ ql/src/test/queries/clientpositive/vector_windowing_windowspec4.q
@@ -0,0 +1,35 @@
+-- Test a small dataset with a larger windowing frame
+
+set hive.cli.print.header=true;
+SET hive.vectorized.execution.enabled=true;
+set hive.fetch.task.conversion=none;
+
+drop table if exists smalltable_windowing;
+
+create table smalltable_windowing(
+       i int,
+       type string);
+insert into smalltable_windowing values(3, 'a'), (1, 'a'), (2, 'a');
+
+explain vectorization detail
+select type, i,
+max(i) over (partition by type order by i rows between 1 preceding and 7 following),
+min(i) over (partition by type order by i rows between 1 preceding and 7 following),
+first_value(i) over (partition by type order by i rows between 1 preceding and 7 following),
+last_value(i) over (partition by type order by i rows between 1 preceding and 7 following),
+avg(i) over (partition by type order by i rows between 1 preceding and 7 following),
+sum(i) over (partition by type order by i rows between 1 preceding and 7 following),
+collect_set(i) over (partition by type order by i rows between 1 preceding and 7 following),
+count(i) over (partition by type order by i rows between 1 preceding and 7 following)
+from smalltable_windowing;
+
+select type, i,
+max(i) over (partition by type order by i rows between 1 preceding and 7 following),
+min(i) over (partition by type order by i rows between 1 preceding and 7 following),
+first_value(i) over (partition by type order by i rows between 1 preceding and 7 following),
+last_value(i) over (partition by type order by i rows between 1 preceding and 7 following),
+avg(i) over (partition by type order by i rows between 1 preceding and 7 following),
+sum(i) over (partition by type order by i rows between 1 preceding and 7 following),
+collect_set(i) over (partition by type order by i rows between 1 preceding and 7 following),
+count(i) over (partition by type order by i rows between 1 preceding and 7 following)
+from smalltable_windowing;
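+
+-- (With only three rows per partition, the "1 preceding/7 following" frames above extend
+-- past the partition on both sides and are clamped to the partition boundaries, so most
+-- of these aggregates end up covering the whole partition.)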
diff --git ql/src/test/queries/clientpositive/vectorized_ptf.q ql/src/test/queries/clientpositive/vectorized_ptf.q
index 232aa11..dbc7ca6 100644
--- ql/src/test/queries/clientpositive/vectorized_ptf.q
+++ ql/src/test/queries/clientpositive/vectorized_ptf.q
@@ -43,7 +43,7 @@ insert into table part_orc select * from part_staging;
 
 --1. test1
 
-explain vectorization extended
+explain vectorization detail
 select p_mfgr, p_name, p_size,
 rank() over (partition by p_mfgr order by p_name) as r,
 dense_rank() over (partition by p_mfgr order by p_name) as dr,
@@ -64,7 +64,7 @@ from noop(on part_orc
 
 -- 2. testJoinWithNoop
 
-explain vectorization extended
+explain vectorization detail
 select p_mfgr, p_name, p_size, p_size - lag(p_size,1,p_size) over (partition by p_mfgr order by p_name) as deltaSz
 from noop (on (select p1.* from part_orc p1 join part_orc p2
 on p1.p_partkey = p2.p_partkey) j
@@ -81,7 +81,7 @@ sort by j.p_name)
 
 -- 3. testOnlyPTF
 
-explain vectorization extended
+explain vectorization detail
 select p_mfgr, p_name, p_size
 from noop(on part_orc
 partition by p_mfgr
@@ -94,7 +94,7 @@ order by p_name);
 
 -- 4. testPTFAlias
 
-explain vectorization extended
+explain vectorization detail
 select p_mfgr, p_name, p_size,
 rank() over (partition by p_mfgr order by p_name) as r,
 dense_rank() over (partition by p_mfgr order by p_name) as dr,
@@ -115,7 +115,7 @@ from noop(on part_orc
 
 -- 5. testPTFAndWhereWithWindowing
 
-explain vectorization extended
+explain vectorization detail
 select p_mfgr, p_name, p_size,
 rank() over (partition by p_mfgr order by p_name) as r,
 dense_rank() over (partition by p_mfgr order by p_name) as dr,
@@ -138,7 +138,7 @@ from noop(on part_orc
 
 -- 6. testSWQAndPTFAndGBy
 
-explain vectorization extended
+explain vectorization detail
 select p_mfgr, p_name, p_size,
 rank() over (partition by p_mfgr order by p_name) as r,
 dense_rank() over (partition by p_mfgr order by p_name) as dr,
@@ -163,7 +163,7 @@ group by p_mfgr, p_name, p_size
 
 -- 7. testJoin
 
-explain vectorization extended
+explain vectorization detail
 select abc.*
 from noop(on part_orc
 partition by p_mfgr
@@ -178,7 +178,7 @@ order by p_name
 
 -- 8. testJoinRight
 
-explain vectorization extended
+explain vectorization detail
 select abc.*
 from part_orc p1 join noop(on part_orc
 partition by p_mfgr
@@ -193,7 +193,7 @@ order by p_name
 
 -- 9. testNoopWithMap
 
-explain vectorization extended
+explain vectorization detail
 select p_mfgr, p_name, p_size,
 rank() over (partition by p_mfgr order by p_name, p_size desc) as r
 from noopwithmap(on part_orc
@@ -208,7 +208,7 @@ order by p_name, p_size desc);
 
 -- 10. testNoopWithMapWithWindowing
 
-explain vectorization extended
+explain vectorization detail
 select p_mfgr, p_name, p_size,
 rank() over (partition by p_mfgr order by p_name) as r,
 dense_rank() over (partition by p_mfgr order by p_name) as dr,
@@ -227,7 +227,7 @@ from noopwithmap(on part_orc
 
 -- 11. testHavingWithWindowingPTFNoGBY
 
-explain vectorization extended
+explain vectorization detail
 select p_mfgr, p_name, p_size,
 rank() over (partition by p_mfgr order by p_name) as r,
 dense_rank() over (partition by p_mfgr order by p_name) as dr,
@@ -248,7 +248,7 @@ order by p_name)
 
 -- 12. testFunctionChain
 
-explain vectorization extended
+explain vectorization detail
 select p_mfgr, p_name, p_size,
 rank() over (partition by p_mfgr order by p_name) as r,
 dense_rank() over (partition by p_mfgr order by p_name) as dr,
@@ -269,7 +269,7 @@ order by p_mfgr, p_name
 
 -- 13. testPTFAndWindowingInSubQ
 
-explain vectorization extended
+explain vectorization detail
 select p_mfgr, p_name,
 sub1.cd, sub1.s1
 from (select p_mfgr, p_name,
@@ -296,7 +296,7 @@ window w1 as (partition by p_mfgr order by p_name rows between 2 preceding and 2
 
 -- 14. testPTFJoinWithWindowingWithCount
 
-explain vectorization extended
+explain vectorization detail
 select abc.p_mfgr, abc.p_name,
 rank() over (distribute by abc.p_mfgr sort by abc.p_name) as r,
 dense_rank() over (distribute by abc.p_mfgr sort by abc.p_name) as dr,
@@ -323,7 +323,7 @@ order by p_name
 
 -- 15. testDistinctInSelectWithPTF
 
-explain vectorization extended
+explain vectorization detail
 select DISTINCT p_mfgr, p_name, p_size
 from noop(on part_orc
 partition by p_mfgr
@@ -342,7 +342,7 @@ round(sum(p_retailprice),2) as s
 from part_orc
 group by p_mfgr, p_brand;
 
-explain vectorization extended
+explain vectorization detail
 select p_mfgr, p_brand, s,
 round(sum(s) over w1,2) as s1
 from noop(on mfgr_price_view
@@ -376,7 +376,7 @@ dr INT,
 cud DOUBLE,
 fv1 INT);
 
-explain vectorization extended
+explain vectorization detail
 from noop(on part_orc
 partition by p_mfgr
 order by p_name)
@@ -413,7 +413,7 @@ select * from part_5;
 
 -- 18. testMulti2OperatorsFunctionChainWithMap
 
-explain vectorization extended
+explain vectorization detail
 select p_mfgr, p_name,
 rank() over (partition by p_mfgr,p_name) as r,
 dense_rank() over (partition by p_mfgr,p_name) as dr,
@@ -448,7 +448,7 @@ from noop(on
 
 -- 19. testMulti3OperatorsFunctionChain
 
-explain vectorization extended
+explain vectorization detail
 select p_mfgr, p_name,
 rank() over (partition by p_mfgr order by p_name) as r,
 dense_rank() over (partition by p_mfgr order by p_name) as dr,
@@ -483,7 +483,7 @@ from noop(on
 
 -- 20. testMultiOperatorChainWithNoWindowing
 
-explain vectorization extended
+explain vectorization detail
 select p_mfgr, p_name,
 rank() over (partition by p_mfgr order by p_name) as r,
 dense_rank() over (partition by p_mfgr order by p_name) as dr,
@@ -515,7 +515,7 @@ from noop(on
 
 -- 21. testMultiOperatorChainEndsWithNoopMap
 
-explain vectorization extended
+explain vectorization detail
 select p_mfgr, p_name,
 rank() over (partition by p_mfgr,p_name) as r,
 dense_rank() over (partition by p_mfgr,p_name) as dr,
@@ -550,7 +550,7 @@ from noopwithmap(on
 
 -- 22. testMultiOperatorChainWithDiffPartitionForWindow1
 
-explain vectorization extended
+explain vectorization detail
 select p_mfgr, p_name,
 rank() over (partition by p_mfgr,p_name order by p_mfgr,p_name) as r,
 dense_rank() over (partition by p_mfgr,p_name order by p_mfgr,p_name) as dr,
@@ -583,7 +583,7 @@ from noop(on
 
 -- 23. testMultiOperatorChainWithDiffPartitionForWindow2
 
-explain vectorization extended
+explain vectorization detail
 select p_mfgr, p_name,
 rank() over (partition by p_mfgr order by p_name) as r,
 dense_rank() over (partition by p_mfgr order by p_name) as dr,
diff --git ql/src/test/queries/clientpositive/windowing_windowspec.q ql/src/test/queries/clientpositive/windowing_windowspec.q
index 08b7d5c..c37aed3 100644
--- ql/src/test/queries/clientpositive/windowing_windowspec.q
+++ ql/src/test/queries/clientpositive/windowing_windowspec.q
@@ -31,6 +31,8 @@ select s, sum(i) over(partition by ts order by s) from over10k limit 100;
 
 select f, sum(f) over (partition by ts order by f range between unbounded preceding and current row) from over10k limit 100;
 
+select f, sum(f) over (partition by ts order by f rows between 2 preceding and 1 preceding) from over10k limit 100;
+
 select s, i, round(avg(d) over (partition by s order by i) / 10.0 , 2) from over10k limit 7;
 
 select s, i, round((avg(d) over w1 + 10.0) - (avg(d) over w1 - 10.0),2) from over10k window w1 as (partition by s order by i) limit 7;