diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.java index 246170d..3637eec 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.java @@ -21,13 +21,13 @@ import org.apache.hadoop.io.Writable; /** - * This class supports string and binary data by value reference -- i.e. each field is + * This class supports string and binary data by value reference -- i.e. each field is * explicitly present, as opposed to provided by a dictionary reference. * In some cases, all the values will be in the same byte array to begin with, - * but this need not be the case. If each value is in a separate byte + * but this need not be the case. If each value is in a separate byte * array to start with, or not all of the values are in the same original * byte array, you can still assign data by reference into this column vector. - * This gives flexibility to use this in multiple situations. + * This gives flexibility to use this in multiple situations. *
* When setting data by reference, the caller * is responsible for allocating the byte arrays used to hold the data. @@ -36,23 +36,23 @@ * though that use is probably not typical. */ public class BytesColumnVector extends ColumnVector { - public byte[][] vector; + public byte[][] vector; public int[] start; // start offset of each field - + /* - * The length of each field. If the value repeats for every entry, then it is stored + * The length of each field. If the value repeats for every entry, then it is stored * in vector[0] and isRepeating from the superclass is set to true. */ - public int[] length; + public int[] length; private byte[] buffer; // optional buffer to use when actually copying in data private int nextFree; // next free position in buffer - + // Estimate that there will be 16 bytes per entry static final int DEFAULT_BUFFER_SIZE = 16 * VectorizedRowBatch.DEFAULT_SIZE; - - // Proportion of extra space to provide when allocating more buffer space. + + // Proportion of extra space to provide when allocating more buffer space. static final float EXTRA_SPACE_FACTOR = (float) 1.2; - + /** * Use this constructor for normal operation. * All column vectors should be the default size normally. @@ -60,21 +60,21 @@ public BytesColumnVector() { this(VectorizedRowBatch.DEFAULT_SIZE); } - + /** * Don't call this constructor except for testing purposes. - * + * * @param size number of elements in the column vector */ public BytesColumnVector(int size) { super(size); vector = new byte[size][]; start = new int[size]; - length = new int[size]; + length = new int[size]; } - + /** Set a field by reference. 
- * + * * @param elementNum index within column vector to set * @param sourceBuf container of source data * @param start start byte position within source @@ -85,37 +85,37 @@ public void setRef(int elementNum, byte[] sourceBuf, int start, int length) { this.start[elementNum] = start; this.length[elementNum] = length; } - - /** + + /** * You must call initBuffer first before using setVal(). * Provide the estimated number of bytes needed to hold * a full column vector worth of byte string data. - * + * * @param estimatedValueSize Estimated size of buffer space needed */ public void initBuffer(int estimatedValueSize) { nextFree = 0; - + // if buffer is already allocated, keep using it, don't re-allocate if (buffer != null) { return; } - + // allocate a little extra space to limit need to re-allocate int bufferSize = this.vector.length * (int)(estimatedValueSize * EXTRA_SPACE_FACTOR); if (bufferSize < DEFAULT_BUFFER_SIZE) { bufferSize = DEFAULT_BUFFER_SIZE; } - buffer = new byte[bufferSize]; + buffer = new byte[bufferSize]; } - + /** * Initialize buffer to default size. */ public void initBuffer() { initBuffer(0); } - + /** * @return amount of buffer space currently allocated */ @@ -125,13 +125,13 @@ public int bufferSize() { } return buffer.length; } - + /** * Set a field by actually copying in to a local buffer. * If you must actually copy data in to the array, use this method. * DO NOT USE this method unless it's not practical to set data by reference with setRef(). * Setting data by reference tends to run a lot faster than copying data in. - * + * * @param elementNum index within column vector to set * @param sourceBuf container of source data * @param start start byte position within source @@ -147,24 +147,52 @@ public void setVal(int elementNum, byte[] sourceBuf, int start, int length) { this.length[elementNum] = length; nextFree += length; } - + + /** + * Set a field to the concatenation of two string values. Result data is copied + * into the internal buffer. 
+ * + * @param elementNum index within column vector to set + * @param leftSourceBuf container of left argument + * @param leftStart start of left argument + * @param leftLen length of left argument + * @param rightSourceBuf container of right argument + * @param rightStart start of right argument + * @param rightLen length of right argument + */ + public void setConcat(int elementNum, byte[] leftSourceBuf, int leftStart, int leftLen, + byte[] rightSourceBuf, int rightStart, int rightLen) { + int newLen = leftLen + rightLen; + if ((nextFree + newLen) > buffer.length) { + increaseBufferSpace(newLen); + } + vector[elementNum] = buffer; + this.start[elementNum] = nextFree; + this.length[elementNum] = newLen; + + System.arraycopy(leftSourceBuf, leftStart, buffer, nextFree, leftLen); + nextFree += leftLen; + System.arraycopy(rightSourceBuf, rightStart, buffer, nextFree, rightLen); + nextFree += rightLen; + } + /** * Increase buffer space enough to accommodate next element. - * This uses an exponential increase mechanism to rapidly + * This uses an exponential increase mechanism to rapidly * increase buffer size to enough to hold all data. * As batches get re-loaded, buffer space allocated will quickly * stabilize. - * + * * @param nextElemLength size of next element to be added */ public void increaseBufferSpace(int nextElemLength) { - + // Keep doubling buffer size until there will be enough space for next element. - int newLength = 2 * buffer.length; + int newLength = 2 * buffer.length; while((nextFree + nextElemLength) > newLength) { newLength *= 2; } - + // Allocate new buffer, copy data to it, and set buffer to new buffer. 
byte[] newBuffer = new byte[newLength]; System.arraycopy(buffer, 0, newBuffer, 0, nextFree); @@ -173,9 +201,8 @@ public void increaseBufferSpace(int nextElemLength) { @Override public Writable getWritableObject(int index) { - + // TODO finish this throw new UnsupportedOperationException("unfinished"); } - } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringConcatColCol.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringConcatColCol.java new file mode 100644 index 0000000..d1af319 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringConcatColCol.java @@ -0,0 +1,414 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.expressions; + +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; + +/** + * Vectorized instruction to concatenate two string columns and put + * the output in a third column. 
+ */ +public class StringConcatColCol extends VectorExpression { + private int colNum1; + private int colNum2; + private int outputColumn; + + public StringConcatColCol(int colNum1, int colNum2, int outputColumn) { + this.colNum1 = colNum1; + this.colNum2 = colNum2; + this.outputColumn = outputColumn; + } + + @Override + public void evaluate(VectorizedRowBatch batch) { + + if (childExpressions != null) { + super.evaluateChildren(batch); + } + + BytesColumnVector inV1 = (BytesColumnVector) batch.cols[colNum1]; + BytesColumnVector inV2 = (BytesColumnVector) batch.cols[colNum2]; + BytesColumnVector outV = (BytesColumnVector) batch.cols[outputColumn]; + int[] sel = batch.selected; + int n = batch.size; + byte[][] vector1 = inV1.vector; + byte[][] vector2 = inV2.vector; + int[] len1 = inV1.length; + int[] len2 = inV2.length; + int[] start1 = inV1.start; + int[] start2 = inV2.start; + + // return immediately if batch is empty + if (n == 0) { + return; + } + + // prepare output buffer to accept results + outV.initBuffer(); + + /* Handle default case for isRepeating setting for output. This will be set to true + * later in the special cases where that is necessary. + */ + outV.isRepeating = false; + + if (inV1.noNulls && !inV2.noNulls) { + + // propagate nulls + + /* We'll assume that there *may* be nulls in the input if !noNulls is true + * for an input vector. This is to be more forgiving of errors in loading + * the vectors. A properly-written vectorized iterator will make sure that + * isNull[0] is set if !noNulls and isRepeating are true for the vector. 
+ */ + outV.noNulls = false; + if (inV2.isRepeating) { + if (inV2.isNull[0]) { + // Output will also be repeating and null + outV.isNull[0] = true; + outV.isRepeating = true; + + //return as no further processing is needed + return; + } + outV.noNulls = true; // repeating non-null input: output has no nulls + } else { + propagateNulls(batch.selectedInUse, n, sel, inV2, outV); + } + + // perform data operation + if (inV1.isRepeating && inV2.isRepeating) { + + /* All must be selected otherwise size would be zero. + * Repeating property will not change. + */ + if (!inV2.isNull[0]) { + outV.setConcat(0, vector1[0], start1[0], len1[0], vector2[0], start2[0], len2[0]); + } + outV.isRepeating = true; + } else if (inV1.isRepeating) { + if (batch.selectedInUse) { + for(int j = 0; j != n; j++) { + int i = sel[j]; + if (!inV2.isNull[i]) { + outV.setConcat(i, vector1[0], start1[0], len1[0], vector2[i], start2[i], len2[i]); + } + } + } else { + for(int i = 0; i != n; i++) { + if (!inV2.isNull[i]) { // inV2 is not repeating here: check row i + outV.setConcat(i, vector1[0], start1[0], len1[0], vector2[i], start2[i], len2[i]); + } + } + } + } else if (inV2.isRepeating) { + if (batch.selectedInUse) { + for(int j = 0; j != n; j++) { + int i = sel[j]; + if (!inV2.isNull[0]) { // inV2 is repeating: only entry 0 is meaningful + outV.setConcat(i, vector1[i], start1[i], len1[i], vector2[0], start2[0], len2[0]); + } + } + } else { + for(int i = 0; i != n; i++) { + if (!inV2.isNull[0]) { // inV2 is repeating: only entry 0 is meaningful + outV.setConcat(i, vector1[i], start1[i], len1[i], vector2[0], start2[0], len2[0]); + } + } + } + } else { + if (batch.selectedInUse) { + for(int j=0; j != n; j++) { + int i = sel[j]; + if (!inV2.isNull[i]) { + outV.setConcat(i, vector1[i], start1[i], len1[i], vector2[i], start2[i], len2[i]); + } + } + } else { + for(int i = 0; i != n; i++) { + if (!inV2.isNull[i]) { + outV.setConcat(i, vector1[i], start1[i], len1[i], vector2[i], start2[i], len2[i]); + } + } + } + } + } else if (!inV1.noNulls && inV2.noNulls) { + + // propagate nulls + outV.noNulls = false; + if (inV1.isRepeating) { + if (inV1.isNull[0]) { + // Output will also be repeating and null; no further processing is needed + outV.isRepeating = true; + 
outV.isNull[0] = true; + return; + } + outV.noNulls = true; // repeating non-null input: output has no nulls + } else { + propagateNulls(batch.selectedInUse, n, sel, inV1, outV); + } + + // perform data operation + if (inV1.isRepeating && inV2.isRepeating) { + //All must be selected otherwise size would be zero + //Repeating property will not change. + if (!inV1.isNull[0]) { + outV.setConcat(0, vector1[0], start1[0], len1[0], vector2[0], start2[0], len2[0]); + } + outV.isRepeating = true; + } else if (inV1.isRepeating) { + if (batch.selectedInUse) { + for(int j = 0; j != n; j++) { + int i = sel[j]; + if (!inV1.isNull[0]) { + outV.setConcat(i, vector1[0], start1[0], len1[0], vector2[i], start2[i], len2[i]); + } + } + } else { + for(int i = 0; i != n; i++) { + if (!inV1.isNull[0]) { + outV.setConcat(i, vector1[0], start1[0], len1[0], vector2[i], start2[i], len2[i]); + } + } + } + } else if (inV2.isRepeating) { + if (batch.selectedInUse) { + for(int j = 0; j != n; j++) { + int i = sel[j]; + if (!inV1.isNull[i]) { + outV.setConcat(i, vector1[i], start1[i], len1[i], vector2[0], start2[0], len2[0]); + } + } + } else { + for(int i = 0; i != n; i++) { + if (!inV1.isNull[i]) { + outV.setConcat(i, vector1[i], start1[i], len1[i], vector2[0], start2[0], len2[0]); + } + } + } + } else { + if (batch.selectedInUse) { + for(int j=0; j != n; j++) { + int i = sel[j]; + if (!inV1.isNull[i]) { + outV.setConcat(i, vector1[i], start1[i], len1[i], vector2[i], start2[i], len2[i]); + } + } + } else { + for(int i = 0; i != n; i++) { + if (!inV1.isNull[i]) { + outV.setConcat(i, vector1[i], start1[i], len1[i], vector2[i], start2[i], len2[i]); + } + } + } + } + } else if (!inV1.noNulls && !inV2.noNulls) { + + // propagate nulls + outV.noNulls = false; + if (inV1.isRepeating && inV2.isRepeating) { + outV.isNull[0] = inV1.isNull[0] || inV2.isNull[0]; + + //Output will also be repeating + outV.isRepeating = true; + + // return if output is null because no additional work is needed + if (outV.isNull[0]) { + return; + } + } 
else if (inV1.isRepeating) { + if (inV1.isNull[0]) { // then all output will be null + outV.isRepeating = true; + outV.isNull[0] = true; + return; + } else { + outV.isRepeating = false; + propagateNulls(batch.selectedInUse, n, sel, inV2, outV); + } + } else if (inV2.isRepeating) { + if (inV2.isNull[0]) { + outV.isRepeating = true; + outV.isNull[0] = true; + return; + } else { + outV.isRepeating = false; + propagateNulls(batch.selectedInUse, n, sel, inV1, outV); + } + } else { + propagateNullsCombine(batch.selectedInUse, n, sel, inV1, inV2, outV); + } + + // perform data operation + if (inV1.isRepeating && inV2.isRepeating) { + + // All must be selected otherwise size would be zero. Repeating property will not change. + if (!inV1.isNull[0] && !inV2.isNull[0]) { + outV.setConcat(0, vector1[0], start1[0], len1[0], vector2[0], start2[0], len2[0]); + } + outV.isRepeating = true; + } else if (inV1.isRepeating) { + if (batch.selectedInUse) { + for(int j = 0; j != n; j++) { + int i = sel[j]; + if (!inV1.isNull[0] && !inV2.isNull[i]) { + outV.setConcat(i, vector1[0], start1[0], len1[0], vector2[i], start2[i], len2[i]); + } + } + } else { + for(int i = 0; i != n; i++) { + if (!inV1.isNull[0] && !inV2.isNull[i]) { + outV.setConcat(i, vector1[0], start1[0], len1[0], vector2[i], start2[i], len2[i]); + } + } + } + } else if (inV2.isRepeating) { + if (batch.selectedInUse) { + for(int j = 0; j != n; j++) { + int i = sel[j]; + if (!inV1.isNull[i] && !inV2.isNull[0]) { + outV.setConcat(i, vector1[i], start1[i], len1[i], vector2[0], start2[0], len2[0]); + } + } + } else { + for(int i = 0; i != n; i++) { + if (!inV1.isNull[i] && !inV2.isNull[0]) { + outV.setConcat(i, vector1[i], start1[i], len1[i], vector2[0], start2[0], len2[0]); + } + } + } + } else { + if (batch.selectedInUse) { + for(int j=0; j != n; j++) { + int i = sel[j]; + if (!inV1.isNull[i] && !inV2.isNull[i]) { + outV.setConcat(i, vector1[i], start1[i], len1[i], vector2[i], start2[i], len2[i]); + } + } + } else { + for(int 
i = 0; i != n; i++) { + if (!inV1.isNull[i] && !inV2.isNull[i]) { + outV.setConcat(i, vector1[i], start1[i], len1[i], vector2[i], start2[i], len2[i]); + } + } + } + } + } else { // there are no nulls in either input vector + + // propagate null information + outV.noNulls = true; + + // perform data operation + if (inV1.isRepeating && inV2.isRepeating) { + + // All must be selected otherwise size would be zero. Repeating property will not change. + outV.setConcat(0, vector1[0], start1[0], len1[0], vector2[0], start2[0], len2[0]); + outV.isRepeating = true; + } else if (inV1.isRepeating) { + if (batch.selectedInUse) { + for(int j = 0; j != n; j++) { + int i = sel[j]; + outV.setConcat(i, vector1[0], start1[0], len1[0], vector2[i], start2[i], len2[i]); + } + } else { + for(int i = 0; i != n; i++) { + outV.setConcat(i, vector1[0], start1[0], len1[0], vector2[i], start2[i], len2[i]); + } + } + } else if (inV2.isRepeating) { + if (batch.selectedInUse) { + for(int j = 0; j != n; j++) { + int i = sel[j]; + outV.setConcat(i, vector1[i], start1[i], len1[i], vector2[0], start2[0], len2[0]); + } + } else { + for(int i = 0; i != n; i++) { + outV.setConcat(i, vector1[i], start1[i], len1[i], vector2[0], start2[0], len2[0]); + } + } + } else { + if (batch.selectedInUse) { + for(int j=0; j != n; j++) { + int i = sel[j]; + outV.setConcat(i, vector1[i], start1[i], len1[i], vector2[i], start2[i], len2[i]); + } + } else { + for(int i = 0; i != n; i++) { + outV.setConcat(i, vector1[i], start1[i], len1[i], vector2[i], start2[i], len2[i]); + } + } + } + } + } + + /** + * Propagate the logic OR of null vectors from two inputs to output. 
+ * + * @param selectedInUse true/false flag to tell if sel[] is in use + * @param n number of qualifying rows + * @param sel selected value position array + * @param inV1 input vector 1 + * @param inV2 input vector 2 + * @param outV output vector + */ + private static void propagateNullsCombine(boolean selectedInUse, int n, int[] sel, + ColumnVector inV1, ColumnVector inV2, BytesColumnVector outV) { + if (selectedInUse) { + for(int j = 0; j != n; j++) { + int i = sel[j]; + outV.isNull[i] = inV1.isNull[i] || inV2.isNull[i]; + } + } else { + for(int i = 0; i != n; i++) { + outV.isNull[i] = inV1.isNull[i] || inV2.isNull[i]; + } + } + } + + /** + * Propagate nulls from input vector inV to output vector outV. + * + * @param selectedInUse true/false flag to tell if sel[] is in use + * @param n number of qualifying rows + * @param sel selected value position array + * @param inV input vector + * @param outV output vector + */ + private static void propagateNulls(boolean selectedInUse, int n, int[] sel, ColumnVector inV, + ColumnVector outV) { + if (selectedInUse) { + for(int j = 0; j != n; j++) { + int i = sel[j]; + outV.isNull[i] = inV.isNull[i]; + } + } else { + System.arraycopy(inV.isNull, 0, outV.isNull, 0, n); + } + } + + @Override + public int getOutputColumn() { + return outputColumn; + } + + @Override + public String getOutputType() { + return "String"; + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringConcatColScalar.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringConcatColScalar.java new file mode 100644 index 0000000..29ef244 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringConcatColScalar.java @@ -0,0 +1,118 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.expressions; + +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; + +/** + * Vectorized instruction to concatenate a string column to a scalar and put + * the result in an output column. + */ +public class StringConcatColScalar extends VectorExpression { + private int colNum; + private int outputColumn; + private byte[] value; + + StringConcatColScalar(int colNum, int outputColumn, byte[] value) { + this.colNum = colNum; + this.outputColumn = outputColumn; + this.value = value; + } + + @Override + public void evaluate(VectorizedRowBatch batch) { + BytesColumnVector inputColVector = (BytesColumnVector) batch.cols[colNum]; + BytesColumnVector outV = (BytesColumnVector) batch.cols[outputColumn]; + int[] sel = batch.selected; + int n = batch.size; + byte[][] vector = inputColVector.vector; + int[] start = inputColVector.start; + int[] length = inputColVector.length; + + if (n == 0) { + + // Nothing to do + return; + } + + // initialize output vector buffer to receive data + outV.initBuffer(); + + if (inputColVector.noNulls) { + outV.noNulls = true; + if (inputColVector.isRepeating) { + outV.isRepeating = true; + outV.setConcat(0, vector[0], start[0], length[0], value, 0, value.length); + } else if (batch.selectedInUse) { + for(int j = 0; j != n; j++) { + int i = 
sel[j]; + outV.setConcat(i, vector[i], start[i], length[i], value, 0, value.length); + } + outV.isRepeating = false; + } else { + for(int i = 0; i != n; i++) { + outV.setConcat(i, vector[i], start[i], length[i], value, 0, value.length); + } + outV.isRepeating = false; + } + } else { + + /* + * Handle case with nulls. Don't do function if the value is null, to save time, + * because calling the function can be expensive. + */ + outV.noNulls = false; + if (inputColVector.isRepeating) { + outV.isRepeating = true; + outV.isNull[0] = inputColVector.isNull[0]; + if (!inputColVector.isNull[0]) { + outV.setConcat(0, vector[0], start[0], length[0], value, 0, value.length); + } + } else if (batch.selectedInUse) { + for(int j = 0; j != n; j++) { + int i = sel[j]; + if (!inputColVector.isNull[i]) { + outV.setConcat(i, vector[i], start[i], length[i], value, 0, value.length); + } + outV.isNull[i] = inputColVector.isNull[i]; + } + outV.isRepeating = false; + } else { + for(int i = 0; i != n; i++) { + if (!inputColVector.isNull[i]) { + outV.setConcat(i, vector[i], start[i], length[i], value, 0, value.length); + } + outV.isNull[i] = inputColVector.isNull[i]; + } + outV.isRepeating = false; + } + } + } + + @Override + public int getOutputColumn() { + return outputColumn; + } + + @Override + public String getOutputType() { + return "String"; + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringConcatScalarCol.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringConcatScalarCol.java new file mode 100644 index 0000000..14205ee --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringConcatScalarCol.java @@ -0,0 +1,118 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.expressions; + +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; + +/** + * Vectorized instruction to concatenate a scalar to a string column and put + * the result in an output column. + */ +public class StringConcatScalarCol extends VectorExpression { + private int colNum; + private int outputColumn; + private byte[] value; + + StringConcatScalarCol(byte[] value, int colNum, int outputColumn) { + this.colNum = colNum; + this.outputColumn = outputColumn; + this.value = value; + } + + @Override + public void evaluate(VectorizedRowBatch batch) { + BytesColumnVector inputColVector = (BytesColumnVector) batch.cols[colNum]; + BytesColumnVector outV = (BytesColumnVector) batch.cols[outputColumn]; + int[] sel = batch.selected; + int n = batch.size; + byte[][] vector = inputColVector.vector; + int[] start = inputColVector.start; + int[] length = inputColVector.length; + + if (n == 0) { + + // Nothing to do + return; + } + + // initialize output vector buffer to receive data + outV.initBuffer(); + + if (inputColVector.noNulls) { + outV.noNulls = true; + if (inputColVector.isRepeating) { + outV.isRepeating = true; + outV.setConcat(0, value, 0, value.length, vector[0], start[0], length[0]); + } else if (batch.selectedInUse) { + for(int j = 0; j != n; j++) { + int i = 
sel[j]; + outV.setConcat(i, value, 0, value.length, vector[i], start[i], length[i]); + } + outV.isRepeating = false; + } else { + for(int i = 0; i != n; i++) { + outV.setConcat(i, value, 0, value.length, vector[i], start[i], length[i]); + } + outV.isRepeating = false; + } + } else { + + /* + * Handle case with nulls. Don't do function if the value is null, to save time, + * because calling the function can be expensive. + */ + outV.noNulls = false; + if (inputColVector.isRepeating) { + outV.isRepeating = true; + outV.isNull[0] = inputColVector.isNull[0]; + if (!inputColVector.isNull[0]) { + outV.setConcat(0, value, 0, value.length, vector[0], start[0], length[0]); + } + } else if (batch.selectedInUse) { + for(int j = 0; j != n; j++) { + int i = sel[j]; + if (!inputColVector.isNull[i]) { + outV.setConcat(i, value, 0, value.length, vector[i], start[i], length[i]); + } + outV.isNull[i] = inputColVector.isNull[i]; + } + outV.isRepeating = false; + } else { + for(int i = 0; i != n; i++) { + if (!inputColVector.isNull[i]) { + outV.setConcat(i, value, 0, value.length, vector[i], start[i], length[i]); + } + outV.isNull[i] = inputColVector.isNull[i]; + } + outV.isRepeating = false; + } + } + } + + @Override + public int getOutputColumn() { + return outputColumn; + } + + @Override + public String getOutputType() { + return "String"; + } +} diff --git a/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java b/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java index c0e71e5..6371d66 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java @@ -37,21 +37,29 @@ * Test vectorized expression and filter evaluation for strings. 
*/ public class TestVectorStringExpressions { - - private static byte[] red; + + private static byte[] red; + private static byte[] redred; private static byte[] red2; // second copy of red, different object private static byte[] green; + private static byte[] greenred; + private static byte[] redgreen; + private static byte[] greengreen; private static byte[] emptyString; private static byte[] mixedUp; private static byte[] mixedUpLower; private static byte[] mixedUpUpper; private static byte[] multiByte; private static byte[] mixPercentPattern; - + static { try { red = "red".getBytes("UTF-8"); + redred = "redred".getBytes("UTF-8"); green = "green".getBytes("UTF-8"); + greenred = "greenred".getBytes("UTF-8"); + redgreen = "redgreen".getBytes("UTF-8"); + greengreen = "greengreen".getBytes("UTF-8"); emptyString = "".getBytes("UTF-8"); mixedUp = "mixedUp".getBytes("UTF-8"); mixedUpLower = "mixedup".getBytes("UTF-8"); @@ -134,37 +142,38 @@ public void testStringColCompareStringScalarFilter() { VectorExpression expr; expr = new FilterStringColEqualStringScalar(0, red2); expr.evaluate(batch); - + // only red qualifies, and it's in entry 0 Assert.assertTrue(batch.size == 1); Assert.assertTrue(batch.selected[0] == 0); - + batch = makeStringBatch(); expr = new FilterStringColLessStringScalar(0, red2); expr.evaluate(batch); - + // only green qualifies, and it's in entry 1 - Assert.assertTrue(batch.size == 1); - Assert.assertTrue(batch.selected[0] == 1); - + Assert.assertTrue(batch.size == 1); + Assert.assertTrue(batch.selected[0] == 1); + batch = makeStringBatch(); expr = new FilterStringColGreaterEqualStringScalar(0, green); expr.evaluate(batch); - + // green and red qualify - Assert.assertTrue(batch.size == 2); - Assert.assertTrue(batch.selected[0] == 0); - Assert.assertTrue(batch.selected[1] == 1); + Assert.assertTrue(batch.size == 2); + Assert.assertTrue(batch.selected[0] == 0); + Assert.assertTrue(batch.selected[1] == 1); } - + VectorizedRowBatch makeStringBatch() { // 
create a batch with one string ("Bytes") column - VectorizedRowBatch batch = new VectorizedRowBatch(1, VectorizedRowBatch.DEFAULT_SIZE); - BytesColumnVector v = new BytesColumnVector(VectorizedRowBatch.DEFAULT_SIZE); + VectorizedRowBatch batch = new VectorizedRowBatch(2); + BytesColumnVector v = new BytesColumnVector(); batch.cols[0] = v; + batch.cols[1] = new BytesColumnVector(); // to hold output if needed /* * Add these 3 values: - * + * * red * green * NULL @@ -175,13 +184,13 @@ VectorizedRowBatch makeStringBatch() { v.isNull[1] = false; v.setRef(2, emptyString, 0, emptyString.length); v.isNull[2] = true; - + v.noNulls = false; - + batch.size = 3; return batch; } - + VectorizedRowBatch makeStringBatchMixedCase() { // create a batch with two string ("Bytes") columns VectorizedRowBatch batch = new VectorizedRowBatch(2, VectorizedRowBatch.DEFAULT_SIZE); @@ -191,7 +200,7 @@ VectorizedRowBatch makeStringBatchMixedCase() { batch.cols[1] = outV; /* * Add these 3 values: - * + * * mixedUp * green * NULL @@ -203,23 +212,23 @@ VectorizedRowBatch makeStringBatchMixedCase() { v.setRef(2, emptyString, 0, emptyString.length); v.isNull[2] = true; v.noNulls = false; - + batch.size = 3; return batch; } - + VectorizedRowBatch makeStringBatchMixedCharSize() { - // create a new batch with one char column (for input) and one long column (for output) + // create a new batch with one char column (for input) and one long column (for output) VectorizedRowBatch batch = new VectorizedRowBatch(2, VectorizedRowBatch.DEFAULT_SIZE); BytesColumnVector v = new BytesColumnVector(VectorizedRowBatch.DEFAULT_SIZE); batch.cols[0] = v; LongColumnVector outV = new LongColumnVector(VectorizedRowBatch.DEFAULT_SIZE); batch.cols[1] = outV; - + /* * Add these 3 values: - * + * * mixedUp * green * NULL @@ -234,11 +243,11 @@ VectorizedRowBatch makeStringBatchMixedCharSize() { v.noNulls = false; v.setRef(3, multiByte, 0, 10); v.isNull[3] = false; - + batch.size = 4; return batch; } - + @Test public void 
testColLower() { // has nulls, not repeating @@ -246,53 +255,53 @@ public void testColLower() { StringLower expr = new StringLower(0, 1); expr.evaluate(batch); BytesColumnVector outCol = (BytesColumnVector) batch.cols[1]; - int cmp = StringExpr.compare(mixedUpLower, 0, mixedUpLower.length, outCol.vector[0], + int cmp = StringExpr.compare(mixedUpLower, 0, mixedUpLower.length, outCol.vector[0], outCol.start[0], outCol.length[0]); Assert.assertEquals(0, cmp); Assert.assertTrue(outCol.isNull[2]); - int cmp2 = StringExpr.compare(green, 0, green.length, outCol.vector[1], + int cmp2 = StringExpr.compare(green, 0, green.length, outCol.vector[1], outCol.start[1], outCol.length[1]); Assert.assertEquals(0, cmp2); - + // no nulls, not repeating batch = makeStringBatchMixedCase(); batch.cols[0].noNulls = true; expr.evaluate(batch); outCol = (BytesColumnVector) batch.cols[1]; - cmp = StringExpr.compare(mixedUpLower, 0, mixedUpLower.length, outCol.vector[0], + cmp = StringExpr.compare(mixedUpLower, 0, mixedUpLower.length, outCol.vector[0], outCol.start[0], outCol.length[0]); Assert.assertEquals(0, cmp); Assert.assertTrue(outCol.noNulls); - + // has nulls, is repeating batch = makeStringBatchMixedCase(); batch.cols[0].isRepeating = true; expr.evaluate(batch); outCol = (BytesColumnVector) batch.cols[1]; - cmp = StringExpr.compare(mixedUpLower, 0, mixedUpLower.length, outCol.vector[0], + cmp = StringExpr.compare(mixedUpLower, 0, mixedUpLower.length, outCol.vector[0], outCol.start[0], outCol.length[0]); Assert.assertEquals(0, cmp); Assert.assertTrue(outCol.isRepeating); Assert.assertFalse(outCol.noNulls); - + // no nulls, is repeating batch = makeStringBatchMixedCase(); batch.cols[0].isRepeating = true; batch.cols[0].noNulls = true; expr.evaluate(batch); outCol = (BytesColumnVector) batch.cols[1]; - cmp = StringExpr.compare(mixedUpLower, 0, mixedUpLower.length, outCol.vector[0], + cmp = StringExpr.compare(mixedUpLower, 0, mixedUpLower.length, outCol.vector[0], outCol.start[0], 
outCol.length[0]); Assert.assertEquals(0, cmp); Assert.assertTrue(outCol.isRepeating); - Assert.assertTrue(outCol.noNulls); + Assert.assertTrue(outCol.noNulls); } - + @Test public void testColUpper() { // no nulls, not repeating - + /* We don't test all the combinations because (at least currently) * the logic is inherited to be the same as testColLower, which checks all the cases). */ @@ -301,15 +310,15 @@ public void testColUpper() { batch.cols[0].noNulls = true; expr.evaluate(batch); BytesColumnVector outCol = (BytesColumnVector) batch.cols[1]; - int cmp = StringExpr.compare(mixedUpUpper, 0, mixedUpUpper.length, outCol.vector[0], + int cmp = StringExpr.compare(mixedUpUpper, 0, mixedUpUpper.length, outCol.vector[0], outCol.start[0], outCol.length[0]); Assert.assertEquals(0, cmp); Assert.assertTrue(outCol.noNulls); } - + @Test public void testStringLength() { - + // has nulls, not repeating VectorizedRowBatch batch = makeStringBatchMixedCharSize(); StringLength expr = new StringLength(0, 1); @@ -326,7 +335,7 @@ public void testStringLength() { outCol = (LongColumnVector) batch.cols[1]; Assert.assertTrue(outCol.noNulls); Assert.assertEquals(4, outCol.vector[3]); // this one has the mixed-size chars - + // has nulls, is repeating batch = makeStringBatchMixedCharSize(); batch.cols[0].isRepeating = true; @@ -347,6 +356,34 @@ public void testStringLength() { Assert.assertTrue(outCol.noNulls); } + private VectorizedRowBatch makeStringBatch2In1Out() { + VectorizedRowBatch batch = new VectorizedRowBatch(3); + BytesColumnVector v = new BytesColumnVector(); + batch.cols[0] = v; + BytesColumnVector v2 = new BytesColumnVector(); + batch.cols[1] = v2; + batch.cols[2] = new BytesColumnVector(); + + v.setRef(0, red, 0, red.length); + v.isNull[0] = false; + v.setRef(1, green, 0, green.length); + v.isNull[1] = false; + v.setRef(2, emptyString, 0, emptyString.length); + v.isNull[2] = true; + v.noNulls = false; + + v2.setRef(0, red, 0, red.length); + v2.isNull[0] = false; + 
v2.setRef(1, green, 0, green.length); + v2.isNull[1] = false; + v2.setRef(2, emptyString, 0, emptyString.length); + v2.isNull[2] = true; + v2.noNulls = false; + + batch.size = 3; + return batch; + } + @Test public void testStringLike() { @@ -400,4 +437,273 @@ public void testStringLike() { // all rows qualify Assert.assertEquals(initialBatchSize, batch.size); } + + @Test + public void testColConcatScalar() { + + // has nulls, not repeating + VectorizedRowBatch batch = makeStringBatch(); + StringConcatColScalar expr = new StringConcatColScalar(0, 1, red); + expr.evaluate(batch); + BytesColumnVector outCol = (BytesColumnVector) batch.cols[1]; + + int cmp = StringExpr.compare(redred, 0, redred.length, outCol.vector[0], + outCol.start[0], outCol.length[0]); + Assert.assertEquals(0, cmp); + Assert.assertTrue(outCol.isNull[2]); + int cmp2 = StringExpr.compare(greenred, 0, greenred.length, outCol.vector[1], + outCol.start[1], outCol.length[1]); + Assert.assertEquals(0, cmp2); + Assert.assertFalse(outCol.noNulls); + Assert.assertFalse(outCol.isRepeating); + + // no nulls, not repeating + batch = makeStringBatch(); + batch.cols[0].noNulls = true; + expr.evaluate(batch); + outCol = (BytesColumnVector) batch.cols[1]; + cmp = StringExpr.compare(redred, 0, redred.length, outCol.vector[0], + outCol.start[0], outCol.length[0]); + Assert.assertEquals(0, cmp); + + cmp2 = StringExpr.compare(greenred, 0, greenred.length, outCol.vector[1], + outCol.start[1], outCol.length[1]); + Assert.assertEquals(0, cmp2); + + int cmp3 = StringExpr.compare(red, 0, red.length, outCol.vector[2], + outCol.start[2], outCol.length[2]); + Assert.assertEquals(0, cmp3); + + Assert.assertTrue(outCol.noNulls); + Assert.assertFalse(outCol.isRepeating); + + // has nulls, is repeating + batch = makeStringBatch(); + batch.cols[0].isRepeating = true; + expr.evaluate(batch); + outCol = (BytesColumnVector) batch.cols[1]; + cmp = StringExpr.compare(redred, 0, redred.length, outCol.vector[0], + outCol.start[0], 
outCol.length[0]); + Assert.assertEquals(0, cmp); + Assert.assertTrue(outCol.isRepeating); + Assert.assertFalse(outCol.noNulls); + + // no nulls, is repeating + batch = makeStringBatch(); + batch.cols[0].isRepeating = true; + batch.cols[0].noNulls = true; + expr.evaluate(batch); + outCol = (BytesColumnVector) batch.cols[1]; + cmp = StringExpr.compare(redred, 0, redred.length, outCol.vector[0], + outCol.start[0], outCol.length[0]); + Assert.assertEquals(0, cmp); + Assert.assertTrue(outCol.isRepeating); + Assert.assertTrue(outCol.noNulls); + } + + @Test + public void testScalarConcatCol() { + + // has nulls, not repeating + VectorizedRowBatch batch = makeStringBatch(); + StringConcatScalarCol expr = new StringConcatScalarCol(red, 0, 1); + expr.evaluate(batch); + BytesColumnVector outCol = (BytesColumnVector) batch.cols[1]; + + int cmp = StringExpr.compare(redred, 0, redred.length, outCol.vector[0], + outCol.start[0], outCol.length[0]); + Assert.assertEquals(0, cmp); + Assert.assertTrue(outCol.isNull[2]); + int cmp2 = StringExpr.compare(redgreen, 0, redgreen.length, outCol.vector[1], + outCol.start[1], outCol.length[1]); + Assert.assertEquals(0, cmp2); + Assert.assertFalse(outCol.noNulls); + Assert.assertFalse(outCol.isRepeating); + + // no nulls, not repeating + batch = makeStringBatch(); + batch.cols[0].noNulls = true; + expr.evaluate(batch); + outCol = (BytesColumnVector) batch.cols[1]; + cmp = StringExpr.compare(redred, 0, redred.length, outCol.vector[0], + outCol.start[0], outCol.length[0]); + Assert.assertEquals(0, cmp); + + cmp2 = StringExpr.compare(redgreen, 0, redgreen.length, outCol.vector[1], + outCol.start[1], outCol.length[1]); + Assert.assertEquals(0, cmp2); + + int cmp3 = StringExpr.compare(red, 0, red.length, outCol.vector[2], + outCol.start[2], outCol.length[2]); + Assert.assertEquals(0, cmp3); + + Assert.assertTrue(outCol.noNulls); + Assert.assertFalse(outCol.isRepeating); + + // has nulls, is repeating + batch = makeStringBatch(); + 
batch.cols[0].isRepeating = true; + expr.evaluate(batch); + outCol = (BytesColumnVector) batch.cols[1]; + cmp = StringExpr.compare(redred, 0, redred.length, outCol.vector[0], + outCol.start[0], outCol.length[0]); + Assert.assertEquals(0, cmp); + Assert.assertTrue(outCol.isRepeating); + Assert.assertFalse(outCol.noNulls); + + // no nulls, is repeating + batch = makeStringBatch(); + batch.cols[0].isRepeating = true; + batch.cols[0].noNulls = true; + expr.evaluate(batch); + outCol = (BytesColumnVector) batch.cols[1]; + cmp = StringExpr.compare(redred, 0, redred.length, outCol.vector[0], + outCol.start[0], outCol.length[0]); + Assert.assertEquals(0, cmp); + Assert.assertTrue(outCol.isRepeating); + Assert.assertTrue(outCol.noNulls); + } + + @Test + public void testColConcatCol() { + + // has nulls, not repeating + VectorizedRowBatch batch = makeStringBatch2In1Out(); + StringConcatColCol expr = new StringConcatColCol(0, 1, 2); + expr.evaluate(batch); + BytesColumnVector outCol = (BytesColumnVector) batch.cols[2]; + + int cmp = StringExpr.compare(redred, 0, redred.length, outCol.vector[0], + outCol.start[0], outCol.length[0]); + Assert.assertEquals(0, cmp); + Assert.assertTrue(outCol.isNull[2]); + int cmp2 = StringExpr.compare(greengreen, 0, greengreen.length, outCol.vector[1], + outCol.start[1], outCol.length[1]); + Assert.assertEquals(0, cmp2); + Assert.assertFalse(outCol.noNulls); + Assert.assertFalse(outCol.isRepeating); + + // no nulls, not repeating + batch = makeStringBatch2In1Out(); + batch.cols[0].noNulls = true; + batch.cols[1].noNulls = true; + expr.evaluate(batch); + outCol = (BytesColumnVector) batch.cols[2]; + + cmp = StringExpr.compare(redred, 0, redred.length, outCol.vector[0], + outCol.start[0], outCol.length[0]); + Assert.assertEquals(0, cmp); + + cmp2 = StringExpr.compare(greengreen, 0, greengreen.length, outCol.vector[1], + outCol.start[1], outCol.length[1]); + Assert.assertEquals(0, cmp2); + + int cmp3 = StringExpr.compare(emptyString, 0, 
emptyString.length, + outCol.vector[2], outCol.start[2], outCol.length[2]); + Assert.assertEquals(0, cmp3); + + Assert.assertTrue(outCol.noNulls); + Assert.assertFalse(outCol.isRepeating); + + // has nulls, is repeating + + batch = makeStringBatch2In1Out(); + batch.cols[0].isRepeating = true; // only left input repeating + batch.cols[0].isNull[0] = true; + expr.evaluate(batch); + outCol = (BytesColumnVector) batch.cols[2]; + + Assert.assertEquals(3, batch.size); + Assert.assertEquals(true, outCol.isRepeating); + Assert.assertEquals(true, outCol.isNull[0]); + + // same, but repeating input is not null + + batch = makeStringBatch2In1Out(); + batch.cols[0].isRepeating = true; + expr.evaluate(batch); + outCol = (BytesColumnVector) batch.cols[2]; + Assert.assertEquals(false, outCol.isRepeating); //TEST FAILED + cmp = StringExpr.compare(redred, 0, redred.length, outCol.vector[0], + outCol.start[0], outCol.length[0]); + Assert.assertEquals(0, cmp); + Assert.assertEquals(true, outCol.isNull[2]); + + batch = makeStringBatch2In1Out(); + batch.cols[1].isRepeating = true; // only right input repeating + batch.cols[1].isNull[0] = true; + expr.evaluate(batch); + outCol = (BytesColumnVector) batch.cols[2]; + + Assert.assertEquals(3, batch.size); + Assert.assertEquals(true, outCol.isRepeating); + Assert.assertEquals(true, outCol.isNull[0]); + + batch = makeStringBatch2In1Out(); + batch.cols[0].isRepeating = true; // both inputs repeat + batch.cols[0].isNull[0] = true; + batch.cols[1].isRepeating = true; + batch.cols[1].isNull[0] = true; + expr.evaluate(batch); + outCol = (BytesColumnVector) batch.cols[2]; + + Assert.assertEquals(3, batch.size); + Assert.assertEquals(true, outCol.isRepeating); + Assert.assertEquals(true, outCol.isNull[0]); + + // no nulls, is repeating + batch = makeStringBatch2In1Out(); + batch.cols[1].isRepeating = true; // only right input repeating and has no nulls + batch.cols[1].noNulls = true; + expr.evaluate(batch); + outCol = (BytesColumnVector) 
batch.cols[2]; + + Assert.assertEquals(3, batch.size); + Assert.assertEquals(false, outCol.isRepeating); + Assert.assertEquals(false, outCol.isNull[0]); + Assert.assertEquals(false, outCol.noNulls); + Assert.assertEquals(true, outCol.isNull[2]); + cmp = StringExpr.compare(greenred, 0, greenred.length, outCol.vector[1], + outCol.start[1], outCol.length[1]); + Assert.assertEquals(0, cmp); + + // try again with left input also having no nulls + batch.cols[0].noNulls = true; + expr.evaluate(batch); + Assert.assertEquals(false, outCol.isRepeating); + Assert.assertEquals(true, outCol.noNulls); + cmp = StringExpr.compare(red, 0, red.length, outCol.vector[2], + outCol.start[2], outCol.length[2]); + Assert.assertEquals(0, cmp); + + batch = makeStringBatch2In1Out(); + batch.cols[0].isRepeating = true; // only left input repeating and has no nulls + batch.cols[0].noNulls = true; + expr.evaluate(batch); + outCol = (BytesColumnVector) batch.cols[2]; + + Assert.assertEquals(3, batch.size); + Assert.assertEquals(false, outCol.isRepeating); + Assert.assertEquals(false, outCol.isNull[0]); + Assert.assertEquals(false, outCol.noNulls); + Assert.assertEquals(true, outCol.isNull[2]); + cmp = StringExpr.compare(redgreen, 0, redgreen.length, outCol.vector[1], + outCol.start[1], outCol.length[1]); + Assert.assertEquals(0, cmp); + + batch = makeStringBatch2In1Out(); + batch.cols[0].isRepeating = true; // both inputs repeat + batch.cols[0].noNulls = true; + batch.cols[1].isRepeating = true; + batch.cols[1].noNulls = true; + expr.evaluate(batch); + outCol = (BytesColumnVector) batch.cols[2]; + + Assert.assertEquals(3, batch.size); + Assert.assertEquals(true, outCol.isRepeating); + Assert.assertEquals(false, outCol.isNull[0]); + cmp = StringExpr.compare(redred, 0, redred.length, outCol.vector[0], + outCol.start[0], outCol.length[0]); + Assert.assertEquals(0, cmp); + } }