diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/VectorHashKeyWrapper.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/VectorHashKeyWrapper.java
index 35712d0..d9d10a2 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/VectorHashKeyWrapper.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/VectorHashKeyWrapper.java
@@ -20,29 +20,38 @@
import java.util.Arrays;
+import org.apache.hadoop.hive.ql.exec.vector.expressions.StringExpr;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
/**
* A hash map key wrapper for vectorized processing.
* It stores the key values as primitives in arrays for each supported primitive type.
- * This works in conjunction with
+ * This works in conjunction with
* {@link org.apache.hadoop.hive.ql.exec.VectorHashKeyWrapperBatch VectorHashKeyWrapperBatch}
- * to hash vectorized processing units (batches).
+ * to hash vectorized processing units (batches).
*/
public class VectorHashKeyWrapper extends KeyWrapper {
-
+
private long[] longValues;
private double[] doubleValues;
+
+ private byte[][] byteValues;
+ private int[] byteStarts;
+ private int[] byteLengths;
+
private boolean[] isNull;
private int hashcode;
-
- public VectorHashKeyWrapper(int longValuesCount, int doubleValuesCount) {
+
+ public VectorHashKeyWrapper(int longValuesCount, int doubleValuesCount, int byteValuesCount) {
longValues = new long[longValuesCount];
doubleValues = new double[doubleValuesCount];
- isNull = new boolean[longValuesCount + doubleValuesCount];
+ byteValues = new byte[byteValuesCount][];
+ byteStarts = new int[byteValuesCount];
+ byteLengths = new int[byteValuesCount];
+ isNull = new boolean[longValuesCount + doubleValuesCount + byteValuesCount];
}
-
+
private VectorHashKeyWrapper() {
}
@@ -56,32 +65,90 @@ void setHashKey() {
hashcode = Arrays.hashCode(longValues) ^
Arrays.hashCode(doubleValues) ^
Arrays.hashCode(isNull);
+
+ // This code, with branches and all, is not executed if there are no string keys
+ for (int i = 0; i < byteValues.length; ++i) {
+ /*
+ * Hashing the string is potentially expensive so is better to branch.
+ * Additionally not looking at values for nulls allows us not reset the values.
+ */
+ if (!isNull[longValues.length + doubleValues.length + i]) {
+ byte[] bytes = byteValues[i];
+ int start = byteStarts[i];
+ int length = byteLengths[i];
+ if (length == bytes.length && start == 0) {
+ hashcode ^= Arrays.hashCode(bytes);
+ }
+ else {
+ // Unfortunately there is no Arrays.hashCode(byte[], start, length)
+ for(int j = start; j < start + length; ++start) {
+ // use 461 as is a (sexy!) prime.
+ hashcode ^= 461 * bytes[j];
+ }
+ }
+ }
+ }
}
-
+
@Override
public int hashCode() {
return hashcode;
}
-
- @Override
+
+ @Override
public boolean equals(Object that) {
if (that instanceof VectorHashKeyWrapper) {
VectorHashKeyWrapper keyThat = (VectorHashKeyWrapper)that;
return hashcode == keyThat.hashcode &&
Arrays.equals(longValues, keyThat.longValues) &&
Arrays.equals(doubleValues, keyThat.doubleValues) &&
- Arrays.equals(isNull, keyThat.isNull);
+ Arrays.equals(isNull, keyThat.isNull) &&
+ byteValues.length == keyThat.byteValues.length &&
+ (0 == byteValues.length || bytesEquals(keyThat));
}
return false;
}
-
+
+  private boolean bytesEquals(VectorHashKeyWrapper keyThat) {
+    // By the time we enter here the byteValues.length and isNull must have already been compared
+    for (int i = 0; i < byteValues.length; ++i) {
+      // the byte comparison is potentially expensive so is better to branch on null
+      if (!isNull[longValues.length + doubleValues.length + i]) {
+        if (0 != StringExpr.compare(
+            byteValues[i],
+            byteStarts[i],
+            byteLengths[i],
+            keyThat.byteValues[i],
+            keyThat.byteStarts[i],
+            keyThat.byteLengths[i])) {
+          return false;
+        }
+      }
+    }
+    return true;
+  }
+
@Override
protected Object clone() {
VectorHashKeyWrapper clone = new VectorHashKeyWrapper();
clone.longValues = longValues.clone();
clone.doubleValues = doubleValues.clone();
clone.isNull = isNull.clone();
+
+ clone.byteValues = new byte[byteValues.length][];
+ clone.byteStarts = new int[byteValues.length];
+ clone.byteLengths = byteLengths.clone();
+ for (int i = 0; i < byteValues.length; ++i) {
+ // avoid allocation/copy of nulls, because it potentially expensive. branch instead.
+ if (!isNull[i]) {
+ clone.byteValues[i] = Arrays.copyOfRange(
+ byteValues[i],
+ byteStarts[i],
+ byteStarts[i] + byteLengths[i]);
+ }
+ }
clone.hashcode = hashcode;
+ assert clone.equals(this);
return clone;
}
@@ -121,19 +188,32 @@ public void assignNullLong(int index) {
longValues[index] = 0; // assign 0 to simplify hashcode
isNull[index] = true;
}
-
+
+ public void assignString(int index, byte[] bytes, int start, int length) {
+ byteValues[index] = bytes;
+ byteStarts[index] = start;
+ byteLengths[index] = length;
+ isNull[longValues.length + doubleValues.length + index] = false;
+ }
+
+ public void assignNullString(int index) {
+ // We do not assign the value to [] because the value is never used on null
+ isNull[longValues.length + doubleValues.length + index] = true;
+ }
+
@Override
- public String toString()
+ public String toString()
{
- return String.format("%d[%s] %d[%s]",
+ return String.format("%d[%s] %d[%s] %d[%s]",
longValues.length, Arrays.toString(longValues),
- doubleValues.length, Arrays.toString(doubleValues));
+ doubleValues.length, Arrays.toString(doubleValues),
+ byteValues.length, Arrays.toString(byteValues));
}
public boolean getIsNull(int i) {
return isNull[i];
}
-
+
public long getLongValue(int i) {
return longValues[i];
}
@@ -142,4 +222,18 @@ public double getDoubleValue(int i) {
return doubleValues[i - longValues.length];
}
+ public byte[] getBytes(int i) {
+ return byteValues[i - longValues.length - doubleValues.length];
+ }
+
+ public int getByteStart(int i) {
+ return byteStarts[i - longValues.length - doubleValues.length];
+ }
+
+ public int getByteLength(int i) {
+ return byteLengths[i - longValues.length - doubleValues.length];
+ }
+
+
}
+
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/VectorHashKeyWrapperBatch.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/VectorHashKeyWrapperBatch.java
index c23614c..2312536 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/VectorHashKeyWrapperBatch.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/VectorHashKeyWrapperBatch.java
@@ -19,22 +19,25 @@
package org.apache.hadoop.hive.ql.exec;
import java.util.Arrays;
+
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.serde2.io.DoubleWritable;
+import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.LongWritable;
/**
- * Class for handling vectorized hash map key wrappers. It evaluates the key columns in a
+ * Class for handling vectorized hash map key wrappers. It evaluates the key columns in a
* row batch in a vectorized fashion.
* This class stores additional information about keys needed to evaluate and output the key values.
*
*/
public class VectorHashKeyWrapperBatch {
-
+
/**
* Helper class for looking up a key value based on key index
*
@@ -42,53 +45,61 @@
private static class KeyLookupHelper {
public int longIndex;
public int doubleIndex;
+ public int stringIndex;
}
-
+
/**
* The key expressions that require evaluation and output the primitive values for each key.
*/
private VectorExpression[] keyExpressions;
-
+
/**
* indices of LONG primitive keys
*/
private int[] longIndices;
-
+
/**
* indices of DOUBLE primitive keys
*/
private int[] doubleIndices;
-
+
+  /*
+   * indices of string (byte[]) primitive keys
+   */
+  private int[] stringIndices;
+
/**
- * pre-allocated batch size vector of keys wrappers.
+ * pre-allocated batch size vector of keys wrappers.
* N.B. these keys are **mutable** and should never be used in a HashMap.
- * Always clone the key wrapper to obtain an immutable keywrapper suitable
+ * Always clone the key wrapper to obtain an immutable keywrapper suitable
* to use a key in a HashMap.
*/
private VectorHashKeyWrapper[] vectorHashKeyWrappers;
-
+
/**
* lookup vector to map from key index to primitive type index
*/
private KeyLookupHelper[] indexLookup;
-
+
/**
- * preallocated and reused LongWritable objects for emiting row mode key values
+ * preallocated and reused LongWritable objects for emiting row mode key values
*/
private LongWritable[] longKeyValueOutput;
-
+
/**
* preallocated and reused DoubleWritable objects for emiting row mode key values
*/
private DoubleWritable[] doubleKeyValueOutput;
-
+
+ private BytesWritable[] stringKeyValueOutput;
+
/**
- * Accessor for the batch-sized array of key wrappers
+ * Accessor for the batch-sized array of key wrappers
*/
public VectorHashKeyWrapper[] getVectorHashKeyWrappers() {
return vectorHashKeyWrappers;
}
-
+
/**
* Processes a batch:
*
@@ -96,71 +107,191 @@
* - Copies out each key's primitive values into the key wrappers
* - computes the hashcode of the key wrappers
*
- * @param vrb
+ * @param batch
* @throws HiveException
*/
- public void evaluateBatch (VectorizedRowBatch vrb) throws HiveException {
+ public void evaluateBatch (VectorizedRowBatch batch) throws HiveException {
for(int i = 0; i < keyExpressions.length; ++i) {
- keyExpressions[i].evaluate(vrb);
+ keyExpressions[i].evaluate(batch);
}
for(int i = 0; i< longIndices.length; ++i) {
int keyIndex = longIndices[i];
int columnIndex = keyExpressions[keyIndex].getOutputColumn();
- LongColumnVector columnVector = (LongColumnVector) vrb.cols[columnIndex];
- if (columnVector.noNulls && !columnVector.isRepeating && !vrb.selectedInUse) {
- assignLongNoNullsNoRepeatingNoSelection(i, vrb.size, columnVector);
- } else if (columnVector.noNulls && !columnVector.isRepeating && vrb.selectedInUse) {
- assignLongNoNullsNoRepeatingSelection(i, vrb.size, columnVector, vrb.selected);
+ LongColumnVector columnVector = (LongColumnVector) batch.cols[columnIndex];
+ if (columnVector.noNulls && !columnVector.isRepeating && !batch.selectedInUse) {
+ assignLongNoNullsNoRepeatingNoSelection(i, batch.size, columnVector);
+ } else if (columnVector.noNulls && !columnVector.isRepeating && batch.selectedInUse) {
+ assignLongNoNullsNoRepeatingSelection(i, batch.size, columnVector, batch.selected);
} else if (columnVector.noNulls && columnVector.isRepeating) {
- assignLongNoNullsRepeating(i, vrb.size, columnVector);
- } else if (!columnVector.noNulls && !columnVector.isRepeating && !vrb.selectedInUse) {
- assignLongNullsNoRepeatingNoSelection(i, vrb.size, columnVector);
+ assignLongNoNullsRepeating(i, batch.size, columnVector);
+ } else if (!columnVector.noNulls && !columnVector.isRepeating && !batch.selectedInUse) {
+ assignLongNullsNoRepeatingNoSelection(i, batch.size, columnVector);
} else if (!columnVector.noNulls && columnVector.isRepeating) {
- assignLongNullsRepeating(i, vrb.size, columnVector);
- } else if (!columnVector.noNulls && !columnVector.isRepeating && vrb.selectedInUse) {
- assignLongNullsNoRepeatingSelection (i, vrb.size, columnVector, vrb.selected);
+ assignLongNullsRepeating(i, batch.size, columnVector);
+ } else if (!columnVector.noNulls && !columnVector.isRepeating && batch.selectedInUse) {
+ assignLongNullsNoRepeatingSelection (i, batch.size, columnVector, batch.selected);
} else {
throw new HiveException (String.format("Unimplemented Long null/repeat/selected combination %b/%b/%b",
- columnVector.noNulls, columnVector.isRepeating, vrb.selectedInUse));
+ columnVector.noNulls, columnVector.isRepeating, batch.selectedInUse));
}
}
for(int i=0;i= 0) {
doubleKeyValueOutput[klh.doubleIndex].set(kw.getDoubleValue(i));
return doubleKeyValueOutput[klh.doubleIndex];
+ } else if (klh.stringIndex >= 0) {
+ stringKeyValueOutput[klh.stringIndex].set(
+ kw.getBytes(i), kw.getByteStart(i), kw.getByteLength(i));
+ return stringKeyValueOutput[klh.stringIndex];
} else {
throw new HiveException(String.format(
- "Internal inconsistent KeyLookupHelper at index [%d]:%d %d",
- i, klh.longIndex, klh.doubleIndex));
+ "Internal inconsistent KeyLookupHelper at index [%d]:%d %d %d",
+ i, klh.longIndex, klh.doubleIndex, klh.stringIndex));
}
- }
+ }
}
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java
index 1ef4955..609aa61 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java
@@ -1063,7 +1063,7 @@ public ColumnVector allocateColumnVector(String type, int defaultSize) {
}
}
- public ObjectInspector createObjectInspector(VectorExpression vectorExpression)
+ public ObjectInspector createObjectInspector(VectorExpression vectorExpression)
throws HiveException {
String columnType = vectorExpression.getOutputType();
if (columnType.equalsIgnoreCase("long") ||
@@ -1071,6 +1071,8 @@ public ObjectInspector createObjectInspector(VectorExpression vectorExpression)
return PrimitiveObjectInspectorFactory.writableLongObjectInspector;
} else if (columnType.equalsIgnoreCase("double")) {
return PrimitiveObjectInspectorFactory.writableDoubleObjectInspector;
+ } else if (columnType.equalsIgnoreCase("string")) {
+ return PrimitiveObjectInspectorFactory.writableStringObjectInspector;
} else {
throw new HiveException(String.format("Must implement type %s", columnType));
}
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/FakeVectorRowBatchFromObjectIterables.java b/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/FakeVectorRowBatchFromObjectIterables.java
new file mode 100644
index 0000000..6824ee7
--- /dev/null
+++ b/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/FakeVectorRowBatchFromObjectIterables.java
@@ -0,0 +1,140 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.vector;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.hadoop.hive.ql.exec.vector.util.FakeVectorRowBatchBase;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+
+/**
+ * Test helper class that creates vectorized execution batches from arbitrary type iterables.
+ */
+public class FakeVectorRowBatchFromObjectIterables extends FakeVectorRowBatchBase {
+
+ private final String[] types;
+  private final List<Iterator<Object>> iterators;
+ private final VectorizedRowBatch batch;
+ private boolean eof;
+ private final int batchSize;
+
+ /**
+ * Helper interface for assigning values to primitive vector column types.
+ */
+ private static interface ColumnVectorAssign
+ {
+ public void assign(
+ ColumnVector columnVector,
+ int row,
+ Object value);
+ }
+
+ private final ColumnVectorAssign[] columnAssign;
+
+ public FakeVectorRowBatchFromObjectIterables(int batchSize, String[] types,
+ Iterable