diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.java new file mode 100644 index 0000000..246170d --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.java @@ -0,0 +1,181 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector; + +import org.apache.hadoop.io.Writable; + +/** + * This class supports string and binary data by value reference -- i.e. each field is + * explicitly present, as opposed to provided by a dictionary reference. + * In some cases, all the values will be in the same byte array to begin with, + * but this need not be the case. If each value is in a separate byte + * array to start with, or not all of the values are in the same original + * byte array, you can still assign data by reference into this column vector. + * This gives flexibility to use this in multiple situations. + *
+ * When setting data by reference, the caller + * is responsible for allocating the byte arrays used to hold the data. + * You can also set data by value, as long as you call the initBuffer() method first. + * You can mix "by value" and "by reference" in the same column vector, + * though that use is probably not typical. + */ +public class BytesColumnVector extends ColumnVector { + public byte[][] vector; + public int[] start; // start offset of each field + + /* + * The length of each field. If the value repeats for every entry, then it is stored + * in vector[0] and isRepeating from the superclass is set to true. + */ + public int[] length; + private byte[] buffer; // optional buffer to use when actually copying in data + private int nextFree; // next free position in buffer + + // Estimate that there will be 16 bytes per entry + static final int DEFAULT_BUFFER_SIZE = 16 * VectorizedRowBatch.DEFAULT_SIZE; + + // Proportion of extra space to provide when allocating more buffer space. + static final float EXTRA_SPACE_FACTOR = (float) 1.2; + + /** + * Use this constructor for normal operation. + * All column vectors should be the default size normally. + */ + public BytesColumnVector() { + this(VectorizedRowBatch.DEFAULT_SIZE); + } + + /** + * Don't call this constructor except for testing purposes. + * + * @param size number of elements in the column vector + */ + public BytesColumnVector(int size) { + super(size); + vector = new byte[size][]; + start = new int[size]; + length = new int[size]; + } + + /** Set a field by reference. + * + * @param elementNum index within column vector to set + * @param sourceBuf container of source data + * @param start start byte position within source + * @param length length of source byte sequence + */ + public void setRef(int elementNum, byte[] sourceBuf, int start, int length) { + vector[elementNum] = sourceBuf; + this.start[elementNum] = start; + this.length[elementNum] = length; + } + + /** + * You must call initBuffer first before using setVal(). + * Provide the estimated number of bytes needed to hold + * a full column vector worth of byte string data. + * + * @param estimatedValueSize Estimated size of buffer space needed + */ + public void initBuffer(int estimatedValueSize) { + nextFree = 0; + + // if buffer is already allocated, keep using it, don't re-allocate + if (buffer != null) { + return; + } + + // allocate a little extra space to limit need to re-allocate + int bufferSize = this.vector.length * (int)(estimatedValueSize * EXTRA_SPACE_FACTOR); + if (bufferSize < DEFAULT_BUFFER_SIZE) { + bufferSize = DEFAULT_BUFFER_SIZE; + } + buffer = new byte[bufferSize]; + } + + /** + * Initialize buffer to default size. + */ + public void initBuffer() { + initBuffer(0); + } + + /** + * @return amount of buffer space currently allocated + */ + public int bufferSize() { + if (buffer == null) { + return 0; + } + return buffer.length; + } + + /** + * Set a field by actually copying in to a local buffer. + * If you must actually copy data in to the array, use this method. + * DO NOT USE this method unless it's not practical to set data by reference with setRef(). + * Setting data by reference tends to run a lot faster than copying data in. + * + * @param elementNum index within column vector to set + * @param sourceBuf container of source data + * @param start start byte position within source + * @param length length of source byte sequence + */ + public void setVal(int elementNum, byte[] sourceBuf, int start, int length) { + if ((nextFree + length) > buffer.length) { + increaseBufferSpace(length); + } + System.arraycopy(sourceBuf, start, buffer, nextFree, length); + vector[elementNum] = buffer; + this.start[elementNum] = nextFree; + this.length[elementNum] = length; + nextFree += length; + } + + /** + * Increase buffer space enough to accommodate next element. + * This uses an exponential increase mechanism to rapidly + * increase buffer size to enough to hold all data. + * As batches get re-loaded, buffer space allocated will quickly + * stabilize. + * + * @param nextElemLength size of next element to be added + */ + public void increaseBufferSpace(int nextElemLength) { + + // Keep doubling buffer size until there will be enough space for next element. + int newLength = 2 * buffer.length; + while((nextFree + nextElemLength) > newLength) { + newLength *= 2; + } + + // Allocate new buffer, copy data to it, and set buffer to new buffer. + byte[] newBuffer = new byte[newLength]; + System.arraycopy(buffer, 0, newBuffer, 0, nextFree); + buffer = newBuffer; + } + + @Override + public Writable getWritableObject(int index) { + + // TODO finish this + throw new UnsupportedOperationException("unfinished"); + } + +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ColumnVector.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ColumnVector.java new file mode 100644 index 0000000..8b4c615 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ColumnVector.java @@ -0,0 +1,64 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector; + +import org.apache.hadoop.io.Writable; + +/** + * ColumnVector contains the shared structure for the sub-types, + * including NULL information, and whether this vector + * repeats, i.e. has all values the same, so only the first + * one is set. This is used to accelerate query performance + * by handling a whole vector in O(1) time when applicable. + * + * The fields are public by design since this is a performance-critical + * structure that is used in the inner loop of query execution. + */ +public abstract class ColumnVector { + + /* + * If hasNulls is true, then this array contains true if the value + * is null, otherwise false. The array is always allocated, so a batch can be re-used + * later and nulls added. + */ + public boolean[] isNull; + + // If the whole column vector has no nulls, this is true, otherwise false. + public boolean noNulls; + + /* + * True if same value repeats for whole column vector. + * If so, vector[0] holds the repeating value. + */ + public boolean isRepeating; + public abstract Writable getWritableObject(int index); + + /** + * Constructor for super-class ColumnVector. This is not called directly, + * but used to initialize inherited fields. + * + * @param len Vector length + */ + public ColumnVector(int len) { + isNull = new boolean[len]; + noNulls = true; + isRepeating = false; + } +} + diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/DoubleColumnVector.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/DoubleColumnVector.java new file mode 100644 index 0000000..d441845 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/DoubleColumnVector.java @@ -0,0 +1,65 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.exec.vector; + +import org.apache.hadoop.hive.serde2.io.DoubleWritable; +import org.apache.hadoop.io.Writable; + +/** + * This class represents a nullable double precision floating point column vector. + * This class will be used for operations on all floating point types (float, double) + * and as such will use a 64-bit double value to hold the biggest possible value. + * During copy-in/copy-out, smaller types (i.e. float) will be converted as needed. This will + * reduce the amount of code that needs to be generated and also will run fast since the + * machine operates with 64-bit words. + * + * The vector[] field is public by design for high-performance access in the inner + * loop of query execution. + */ +public class DoubleColumnVector extends ColumnVector { + public double[] vector; + private DoubleWritable writableObj = new DoubleWritable(); + + /** + * Use this constructor by default. All column vectors + * should normally be the default size. + */ + public DoubleColumnVector() { + this(VectorizedRowBatch.DEFAULT_SIZE); + } + + /** + * Don't use this except for testing purposes. + * + * @param len + */ + public DoubleColumnVector(int len) { + super(len); + vector = new double[len]; + } + + @Override + public Writable getWritableObject(int index) { + if (!noNulls && isNull[index]) { + return null; + } else { + writableObj.set(vector[index]); + return writableObj; + } + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/LongColumnVector.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/LongColumnVector.java new file mode 100644 index 0000000..00dcd85 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/LongColumnVector.java @@ -0,0 +1,65 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.exec.vector; + +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Writable; + +/** + * This class represents a nullable int column vector. + * This class will be used for operations on all integer types (tinyint, smallint, int, bigint) + * and as such will use a 64-bit long value to hold the biggest possible value. + * During copy-in/copy-out, smaller int types will be converted as needed. This will + * reduce the amount of code that needs to be generated and also will run fast since the + * machine operates with 64-bit words. + * + * The vector[] field is public by design for high-performance access in the inner + * loop of query execution. + */ +public class LongColumnVector extends ColumnVector { + public long[] vector; + private LongWritable writableObj = new LongWritable(); + + /** + * Use this constructor by default. All column vectors + * should normally be the default size. + */ + public LongColumnVector() { + this(VectorizedRowBatch.DEFAULT_SIZE); + } + + /** + * Don't use this except for testing purposes. + * + * @param len + */ + public LongColumnVector(int len) { + super(len); + vector = new long[len]; + } + + @Override + public Writable getWritableObject(int index) { + if (!noNulls && isNull[index]) { + return null; + } else { + writableObj.set(vector[index]); + return writableObj; + } + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatch.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatch.java new file mode 100644 index 0000000..b4097ca --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatch.java @@ -0,0 +1,165 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.exec.vector; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; + +import org.apache.hadoop.io.Writable; + +/** + * A VectorizedRowBatch is a set of rows, organized with each column + * as a vector. It is the unit of query execution, organized to minimize + * the cost per row and achieve high cycles-per-instruction. + * The major fields are public by design to allow fast and convenient + * access by the vectorized query execution code. + */ +public class VectorizedRowBatch implements Writable { + public int numCols; // number of columns + public ColumnVector[] cols; // a vector for each column + public int size; // number of rows that qualify (i.e. haven't been filtered out) + public int[] selected; // array of positions of selected values + + /* + * If no filtering has been applied yet, selectedInUse is false, + * meaning that all rows qualify. If it is true, then the selected[] array + * records the offsets of qualifying rows. + */ + public boolean selectedInUse; + + // If this is true, then there is no data in the batch -- we have hit the end of input. + public boolean endOfFile; + + /* + * This number is carefully chosen to minimize overhead and typically allows + * one VectorizedRowBatch to fit in cache. + */ + public static final int DEFAULT_SIZE = 1024; + + private Writable[] writableRow; + private int rowIteratorIndex = 0; + + /** + * Return a batch with the specified number of columns. + * This is the standard constructor -- all batches should be the same size + * + * @param numCols the number of columns to include in the batch + */ + public VectorizedRowBatch(int numCols) { + this(numCols, DEFAULT_SIZE); + } + + /** + * Return a batch with the specified number of columns and rows. + * Only call this constructor directly for testing purposes. + * Batch size should normally always be defaultSize. + * + * @param numCols the number of columns to include in the batch + * @param size the number of rows to include in the batch + */ + public VectorizedRowBatch(int numCols, int size) { + this.numCols = numCols; + this.size = size; + selected = new int[size]; + selectedInUse = false; + this.cols = new ColumnVector[numCols]; + writableRow = new Writable[numCols]; + } + + public void initRowIterator(){ + this.rowIteratorIndex = 0; + } + + public Writable [] getNextRow() { + if (rowIteratorIndex >= size) { + return null; + } + if (selectedInUse) { + int i = selected[rowIteratorIndex]; + for (int c = 0; c < numCols; c++) { + writableRow[c] = cols[c].getWritableObject(i); + } + } else { + int i = rowIteratorIndex; + for (int c = 0; c < numCols; c++) { + writableRow[c] = cols[c].getWritableObject(i); + } + } + return writableRow; + } + + /** + * Return count of qualifying rows. + * + * @return number of rows that have not been filtered out + */ + public long count() { + return size; + } + + @Override + public String toString() { + if (size == 0) { + return ""; + } + StringBuilder b = new StringBuilder(); + if (this.selectedInUse) { + for (int j = 0; j < size; j++) { + int i = selected[j]; + int colIndex = 0; + for (ColumnVector cv : cols) { + b.append(cv.getWritableObject(i).toString()); + colIndex++; + if (colIndex < cols.length) { + b.append('\u0001'); + } + } + if (j < size-1) { + b.append('\n'); + } + } + } else { + for (int i = 0; i < size; i++) { + int colIndex = 0; + for (ColumnVector cv : cols) { + b.append(cv.getWritableObject(i).toString()); + colIndex++; + if (colIndex < cols.length) { + b.append('\u0001'); + } + } + if (i < size-1) { + b.append('\n'); + } + } + } + return b.toString(); + } + + @Override + public void readFields(DataInput arg0) throws IOException { + throw new UnsupportedOperationException("Do you really need me?"); + } + + @Override + public void write(DataOutput arg0) throws IOException { + throw new UnsupportedOperationException("Don't call me"); + } +} + diff --git a/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorizedRowBatch.java b/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorizedRowBatch.java new file mode 100644 index 0000000..a250c9d --- /dev/null +++ b/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorizedRowBatch.java @@ -0,0 +1,221 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector; + +import java.util.Random; + +import junit.framework.Assert; +import org.junit.Test; + +/** + * Test creation and basic manipulation of VectorizedRowBatch. + */ +public class TestVectorizedRowBatch { + + // test fields + static final String[] COLORS = {"red", "yellow", "green", "blue", "violet", "orange"}; + private static byte[][] colorsBytes; + + private VectorizedRowBatch makeBatch() { + VectorizedRowBatch batch = new VectorizedRowBatch(3); + LongColumnVector lv = new LongColumnVector(); + DoubleColumnVector dv = new DoubleColumnVector(); + BytesColumnVector bv = new BytesColumnVector(); + setSampleStringCol(bv); + batch.cols[0] = lv; + batch.cols[1] = dv; + batch.cols[2] = bv; + addRandomNulls(batch); + return batch; + } + + @Test + /** + * Make sure you can create a batch and that all columns are the + * default size. + */ + public void testVectorizedRowBatchCreate() { + VectorizedRowBatch batch = makeBatch(); + Assert.assertEquals(3, batch.numCols); + Assert.assertEquals(VectorizedRowBatch.DEFAULT_SIZE, batch.size); + Assert.assertEquals(((LongColumnVector) batch.cols[0]).vector.length, + VectorizedRowBatch.DEFAULT_SIZE); + Assert.assertEquals(((DoubleColumnVector) batch.cols[1]).vector.length, + VectorizedRowBatch.DEFAULT_SIZE); + Assert.assertEquals(((BytesColumnVector) batch.cols[2]).vector.length, + VectorizedRowBatch.DEFAULT_SIZE); + } + + /* + * Test routines to exercise VectorizedRowBatch + * by filling column vectors with data and null values. + */ + + public static void setRandom(VectorizedRowBatch batch) { + batch.size = VectorizedRowBatch.DEFAULT_SIZE; + for (int i = 0; i != batch.numCols; i++) { + batch.cols[i] = new LongColumnVector(VectorizedRowBatch.DEFAULT_SIZE); + setRandomLongCol((LongColumnVector) batch.cols[i]); + } + } + + public static void setSample(VectorizedRowBatch batch) { + batch.size = VectorizedRowBatch.DEFAULT_SIZE; + for (int i = 0; i != batch.numCols; i++) { + setSampleLongCol((LongColumnVector) batch.cols[i]); + } + } + + /** + * Set to sample data, re-using existing columns in batch. + * + * @param batch + */ + public static void setSampleOverwrite(VectorizedRowBatch batch) { + + // Put sample data in the columns. + for (int i = 0; i != batch.numCols; i++) { + setSampleLongCol((LongColumnVector) batch.cols[i]); + } + + // Reset the selection vector. + batch.selectedInUse = false; + batch.size = VectorizedRowBatch.DEFAULT_SIZE; + } + + /** + * Sprinkle null values in this column vector. + * + * @param col + */ + public static void addRandomNulls(ColumnVector col) { + col.noNulls = false; + Random rand = new Random(); + for(int i = 0; i != col.isNull.length; i++) { + col.isNull[i] = Math.abs(rand.nextInt() % 11) == 0; + } + } + + /** + * Add null values, but do it faster, by avoiding use of Random(). + * + * @param col + */ + public void addSampleNulls(ColumnVector col) { + col.noNulls = false; + assert col.isNull != null; + for(int i = 0; i != col.isNull.length; i++) { + col.isNull[i] = i % 11 == 0; + } + } + + public static void addRandomNulls(VectorizedRowBatch batch) { + for (int i = 0; i != batch.numCols; i++) { + addRandomNulls(batch.cols[i]); + } + } + + public void addSampleNulls(VectorizedRowBatch batch) { + for (int i = 0; i != batch.numCols; i++) { + addSampleNulls(batch.cols[i]); + } + } + + /** + * Set vector elements to sample string data from colorsBytes string table. + * @param col + */ + public static void setSampleStringCol(BytesColumnVector col) { + initColors(); + int size = col.vector.length; + for(int i = 0; i != size; i++) { + int pos = i % colorsBytes.length; + col.setRef(i, colorsBytes[pos], 0, colorsBytes[pos].length); + } + } + + /* + * Initialize string table in a lazy fashion. + */ + private static void initColors() { + if (colorsBytes == null) { + colorsBytes = new byte[COLORS.length][]; + for (int i = 0; i != COLORS.length; i++) { + colorsBytes[i] = COLORS[i].getBytes(); + } + } + } + + + /** + * Set the vector to sample data that repeats an iteration from 0 to 99. + * @param col + */ + public static void setSampleLongCol(LongColumnVector col) { + int size = col.vector.length; + for(int i = 0; i != size; i++) { + col.vector[i] = i % 100; + } + } + + /** + * Set the vector to random data in the range 0 to 99. + * This has significant overhead for random number generation. Use setSample() to reduce overhead. + */ + public static void setRandomLongCol(LongColumnVector col) { + int size = col.vector.length; + Random rand = new Random(System.currentTimeMillis()); + for(int i = 0; i != size; i++) { + col.vector[i] = Math.abs(rand.nextInt() % 100); + } + } + + public static void setRepeatingLongCol(LongColumnVector col) { + col.isRepeating = true; + col.vector[0] = 50; + } + + /** + * Set the vector to sample data that repeats an iteration from 0 to 99. + * @param col + */ + public static void setSampleDoubleCol(DoubleColumnVector col) { + int size = col.vector.length; + for(int i = 0; i != size; i++) { + col.vector[i] = i % 100; + } + } + + /** + * Set the vector to random data in the range 0 to 99. + * This has significant overhead for random number generation. Use setSample() to reduce overhead. + */ + public static void setRandomDoubleCol(DoubleColumnVector col) { + int size = col.vector.length; + Random rand = new Random(); + for(int i = 0; i != size; i++) { + col.vector[i] = Math.abs(rand.nextInt() % 100); + } + } + + public static void setRepeatingDoubleCol(DoubleColumnVector col) { + col.isRepeating = true; + col.vector[0] = 50.0; + } +}