diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.java new file mode 100644 index 0000000..3e351b2 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.java @@ -0,0 +1,172 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector; + +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Writable; + +/** + * This class supports string and binary data by value reference -- i.e. each field is + * explicitly present, as opposed to provided by a dictionary reference. + * In some cases, all the values will be in the same byte array to begin with, + * but this need not be the case. If each value is in a separate byte + * array to start with, or not all of the values are in the same original + * byte array, you can still assign data by reference into this column vector. + * This gives flexibility to use this in multiple situations. + *

+ * When setting data by reference, the caller + * is responsible for allocating the byte arrays used to hold the data. + * You can also set data by value, as long as you call the initBuffer() method first. + * You can mix "by value" and "by reference" in the same column vector, + * though that use is probably not typical. + */ +public class BytesColumnVector extends ColumnVector { + public byte[][] vector; + public int[] start; // start offset of each field + public int[] length; // length of each field + // If the value repeats for every entry, then it is stored in vector[0] + // and isRepeating from the superclass is set to true. + private byte[] buffer; // optional buffer to use when actually copying in data + private int nextFree; // next free position in buffer + + // Estimate that there will be 16 bytes per entry + static final int defaultBufferSize = 16 * VectorizedRowBatch.defaultSize; + + /** + * Use this constructor for normal operation. + * All column vectors should be the default size normally. + */ + public BytesColumnVector() { + this(VectorizedRowBatch.defaultSize); + } + + /** + * Don't call this constructor except for testing purposes. + * + * @param size number of elements in the column vector + */ + public BytesColumnVector(int size) { + super(size); + vector = new byte[size][]; + start = new int[size]; + length = new int[size]; + } + + /** Set a field by reference. + * + * @param elementNum index within column vector to set + * @param sourceBuf container of source data + * @param start start byte position within source + * @param length length of source byte sequence + */ + public void setRef(int elementNum, byte[] sourceBuf, int start, int length) { + vector[elementNum] = sourceBuf; + this.start[elementNum] = start; + this.length[elementNum] = length; + } + + /** + * You must call initBuffer first before using setVal(). + * Provide the estimated number of bytes needed to hold + * a full column vector worth of byte string data. + * + * @param estimatedValueSize Estimated size of buffer space needed + */ + public void initBuffer(int estimatedValueSize) { + nextFree = 0; + // if buffer is already allocated, keep using it, don't re-allocate + if (buffer != null) { + return; + } + // allocate 20% extra space to limit need to re-allocate + int bufferSize = this.vector.length * (int)(estimatedValueSize * 1.2); + if (bufferSize < defaultBufferSize) { + bufferSize = defaultBufferSize; + } + buffer = new byte[bufferSize]; + } + + /** + * Initialize buffer to default size + */ + public void initBuffer() { + initBuffer(0); + } + + /** + * @return amount of buffer space currently allocated + */ + public int bufferSize() { + if (buffer == null) { + return 0; + } + return buffer.length; + } + + /** + * Set a field by actually copying in to a local buffer. + * If you must actually copy data in to the array, use this method. + * DO NOT USE this method unless it's not practical to set data by reference with setRef(). + * Setting data by reference tends to run a lot faster than copying data in. + * + * @param elementNum index within column vector to set + * @param sourceBuf container of source data + * @param start start byte position within source + * @param length length of source byte sequence + */ + public void setVal(int elementNum, byte[] sourceBuf, int start, int length) { + if (nextFree + length > buffer.length) { + increaseBufferSpace(length); + } + System.arraycopy(sourceBuf, start, buffer, nextFree, length); + vector[elementNum] = buffer; + this.start[elementNum] = nextFree; + this.length[elementNum] = length; + nextFree += length; + } + + /** + * Increase buffer space enough to accommodate next element. + * This uses an exponential increase mechanism to rapidly + * increase buffer size to enough to hold all data. + * As batches get re-loaded, buffer space allocated will quickly + * stabilize. + * + * @param nextElemLength size of next element to be added + */ + public void increaseBufferSpace(int nextElemLength) { + // Keep doubling buffer size until there will be enough space for next element. + int newLength = 2 * buffer.length; + while(nextFree + nextElemLength > newLength) { + newLength *= 2; + } + // Allocate new buffer, copy data to it, and set buffer to new buffer. + byte[] newBuffer = new byte[newLength]; + System.arraycopy(buffer, 0, newBuffer, 0, nextFree); + buffer = newBuffer; + } + + @Override + public Writable getWritableObject(int index) { + // TODO finish this + assert false : "not implemented"; + return null; + } + +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ColumnVector.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ColumnVector.java new file mode 100644 index 0000000..30f16a5 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ColumnVector.java @@ -0,0 +1,42 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector; + +import java.util.Random; + +import org.apache.hadoop.io.Writable; + +public abstract class ColumnVector { + public boolean[] isNull; // if hasNulls is true, then this array contains true if the value is null, otherwise false. + // The array is always allocated, so a batch can be re-used later and nulls added. + public boolean noNulls; // if the whole column vector has no nulls, this is true, otherwise false. + public boolean isRepeating; // true if same value repeats for whole column vector. If so, vector[0] holds the repeating value. + public void setRandom(){assert false;}; // set column vector to all random data + public void setSample(){assert false;}; // set column vector to sample data (a little faster) + public void setRepeating(){assert false;}; // set column vector to a single repeating value + public String getString(int i){return "ERROR: not implemented";} // get string representing the i'th element of vector + + public abstract Writable getWritableObject(int index); + + public ColumnVector(int len) { + isNull = new boolean[len]; + noNulls = true; + isRepeating = false; + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/DoubleColumnVector.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/DoubleColumnVector.java new file mode 100644 index 0000000..d45f76e --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/DoubleColumnVector.java @@ -0,0 +1,64 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.exec.vector; +import java.util.Random; + +import org.apache.hadoop.hive.serde2.io.DoubleWritable; +import org.apache.hadoop.io.Writable; + +/** + * This class represents a nullable double precision floating point column vector. + * This class will be used for operations on all floating point types (float, double) + * and as such will use a 64-bit double value to hold the biggest possible value. + * During copy-in/copy-out, smaller types (i.e. float) will be converted as needed. This will + * reduce the amount of code that needs to be generated and also will run fast since the + * machine operates with 64-bit words. + */ +public class DoubleColumnVector extends ColumnVector { + public double[] vector; + public DoubleWritable writableObj = new DoubleWritable(); + + /** + * Use this constructor by default. All column vectors + * should normally be the default size. + */ + public DoubleColumnVector() { + this(VectorizedRowBatch.defaultSize); + } + + /** + * Note: don't use this except for testing purposes + * + * @param len + */ + public DoubleColumnVector(int len) + { + super(len); + vector = new double[len]; + } + + @Override + public Writable getWritableObject(int index) { + if (!noNulls && isNull[index]) { + return null; + } else { + writableObj.set(vector[index]); + return writableObj; + } + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/LongColumnVector.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/LongColumnVector.java new file mode 100644 index 0000000..ef977cc --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/LongColumnVector.java @@ -0,0 +1,55 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.exec.vector; +import java.util.Random; + +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Writable; + +/** + * This class represents a nullable int column vector. + * This class will be used for operations on all integer types (tinyint, smallint, int, bigint) + * and as such will use a 64-bit long value to hold the biggest possible value. + * During copy-in/copy-out, smaller int types will be converted as needed. This will + * reduce the amount of code that needs to be generated and also will run fast since the + * machine operates with 64-bit words. + */ +public class LongColumnVector extends ColumnVector { + public long[] vector; + public LongWritable writableObj = new LongWritable(); + + public LongColumnVector() { + this(VectorizedRowBatch.defaultSize); + } + + public LongColumnVector(int len) + { + super(len); + vector = new long[len]; + } + + @Override + public Writable getWritableObject(int index) { + if (!noNulls && isNull[index]) { + return null; + } else { + writableObj.set(vector[index]); + return writableObj; + } + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatch.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatch.java new file mode 100644 index 0000000..62f83c6 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatch.java @@ -0,0 +1,157 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.exec.vector; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; + +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.io.Writable; + +/** + * A VectorizedRowBatch is a set of rows, organized with each column + * as a vector. It is the unit of query execution, organized to minimize + * the cost per row and achieve high cycles-per-instruction. + * The major fields are public by design to allow fast and convenient + * access by the vectorized query execution code. + */ +public class VectorizedRowBatch implements Writable { + public int numCols; // number of columns + public ColumnVector[] cols; // a vector for each column + public int size; // number of rows that qualify (i.e. haven't been filtered out) + public int selected[]; // array of positions of selected values + public boolean selectedInUse; // if no filtering has been applied yet, selectedInUse is false, + // meaning that all rows qualify. If it is true, then the selected[] array + // records the offsets of qualifying rows. + public boolean EOF; // if this is true, then there is no data in the batch + // -- we have hit the end of input + public static final int defaultSize = 1024; // This number is carefully chosen to minimize + // overhead and typically allows one VectorizedRowBatch to fit in L1 cache. + + public Writable [] writableRow; + private int rowIteratorIndex = 0; + + /** + * Return a batch with the specified number of columns. + * This is the standard constructor -- all batches should be the same size + * + * @param numCols the number of columns to include in the batch + */ + public VectorizedRowBatch(int numCols) { + this(numCols, defaultSize); + } + + /** + * Return a batch with the specified number of columns and rows. + * Only call this constructor directly for testing purposes. + * Batch size should normally always be defaultSize. + * + * @param numCols the number of columns to include in the batch + * @param size the number of rows to include in the batch + */ + public VectorizedRowBatch (int numCols, int size) { + this.numCols = numCols; + this.size = size; + selected = new int[size]; + selectedInUse = false; + this.cols = new ColumnVector[numCols]; + writableRow = new Writable[numCols]; + } + + public void initRowIterator() { + this.rowIteratorIndex = 0; + } + + public Writable [] getNextRow() { + if (rowIteratorIndex >= size) { + return null; + } + if (selectedInUse) { + int i = selected[rowIteratorIndex]; + for (int c = 0; c < numCols; c++) { + writableRow[c] = cols[c].getWritableObject(i); + } + } else { + int i = rowIteratorIndex; + for (int c = 0; c < numCols; c++) { + writableRow[c] = cols[c].getWritableObject(i); + } + } + return writableRow; + } + + /** + * Return count of qualifying rows. + * + * @return number of rows that have not been filtered out + */ + public long count() { + return size; + } + + @Override + public String toString() { + if (size == 0) { + return ""; + } + StringBuilder b = new StringBuilder(); + if (this.selectedInUse) { + for (int j = 0; j < size; j++) { + int i = selected[j]; + int colIndex = 0; + for (ColumnVector cv : cols) { + b.append(cv.getWritableObject(i).toString()); + colIndex++; + if (colIndex < cols.length) { + b.append('\u0001'); + } + } + if (j < size-1) { + b.append('\n'); + } + } + } else { + for (int i = 0; i < size; i++) { + int colIndex = 0; + for (ColumnVector cv : cols) { + b.append(cv.getWritableObject(i).toString()); + colIndex++; + if (colIndex < cols.length) { + b.append('\u0001'); + } + } + if (i < size-1) { + b.append('\n'); + } + } + } + return b.toString(); + } + + @Override + public void readFields(DataInput arg0) throws IOException { + throw new IOException("Do you really need me?"); + } + + @Override + public void write(DataOutput arg0) throws IOException { + throw new IOException("Don't call me"); + } +} diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorizedRowBatch.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorizedRowBatch.java new file mode 100644 index 0000000..27c138e --- /dev/null +++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorizedRowBatch.java @@ -0,0 +1,229 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector; + +import java.util.Random; + +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; + +import junit.framework.Assert; +import org.junit.Test; + +public class TestVectorizedRowBatch { + // test fields + static final String[] colors = { "red", "yellow", "green", "blue", "violet", "orange" }; + static byte[][] colorsBytes; + + private VectorizedRowBatch makeBatch() { + VectorizedRowBatch batch = new VectorizedRowBatch(3); + LongColumnVector lv = new LongColumnVector(); + DoubleColumnVector dv = new DoubleColumnVector(); + BytesColumnVector bv = new BytesColumnVector(); + setSampleStringCol(bv); + batch.cols[0] = lv; + batch.cols[1] = dv; + batch.cols[2] = bv; + addRandomNulls(batch); + return batch; + } + + @Test + /** + * Make sure you can create a batch and that all columns are the + * default size. + */ + public void testVectorizedRowBatchCreate() { + VectorizedRowBatch batch = makeBatch(); + Assert.assertEquals(3, batch.numCols); + Assert.assertEquals(VectorizedRowBatch.defaultSize, batch.size); + Assert.assertEquals(((LongColumnVector) batch.cols[0]).vector.length, + VectorizedRowBatch.defaultSize); + Assert.assertEquals(((DoubleColumnVector) batch.cols[1]).vector.length, + VectorizedRowBatch.defaultSize); + Assert.assertEquals(((BytesColumnVector) batch.cols[2]).vector.length, + VectorizedRowBatch.defaultSize); + } + + // Test routines to exercise VectorizedRowBatch + // by filling column vectors with data and null values. + + public static void setRandom(VectorizedRowBatch batch) { + batch.size = VectorizedRowBatch.defaultSize; + for (int i = 0; i != batch.numCols; i++) { + batch.cols[i] = new LongColumnVector(VectorizedRowBatch.defaultSize); + batch.cols[i].setRandom(); + } + } + + public static void setSample(VectorizedRowBatch batch) { + batch.size = VectorizedRowBatch.defaultSize; + for (int i = 0; i != batch.numCols; i++) + { + batch.cols[i].setSample(); + } + } + + /** + * set to sample data, re-using existing columns in batch + * + * @param batch + */ + public static void setSampleOverwrite(VectorizedRowBatch batch) { + // put sample data in the columns + for (int i = 0; i != batch.numCols; i++) + { + batch.cols[i].setSample(); + } + // reset the selection vector + batch.selectedInUse = false; + batch.size = VectorizedRowBatch.defaultSize; + } + + /** + * sprinkle null values in this column vector + * @param col + */ + public static void addRandomNulls(ColumnVector col) + { + col.noNulls = false; + Random rand = new Random(); + for(int i = 0; i != col.isNull.length; i++) + { + col.isNull[i] = Math.abs( rand.nextInt() % 11 ) == 0; + } + } + + /** + * add null values, but do it faster, by avoiding use of Random() + * @param col + */ + public void addSampleNulls(ColumnVector col) + { + col.noNulls = false; + assert col.isNull != null; + for(int i = 0; i != col.isNull.length; i++) + { + col.isNull[i] = i % 11 == 0; + } + } + + public static void addRandomNulls(VectorizedRowBatch batch) { + for (int i = 0; i != batch.numCols; i++) + { + addRandomNulls(batch.cols[i]); + } + } + + public void addSampleNulls(VectorizedRowBatch batch) { + for (int i = 0; i != batch.numCols; i++) + { + addSampleNulls(batch.cols[i]); + } + } + + /** + * Set vector elements to sample string data from colorsBytes string table. + * @param col + */ + public static void setSampleStringCol(BytesColumnVector col) { + initColors(); + int size = col.vector.length; + for(int i = 0; i != size; i++) + { + int pos = i % colorsBytes.length; + col.setRef(i, colorsBytes[pos], 0, colorsBytes[pos].length ); + } + } + + /** + * lazy initialization of string table + */ + private static void initColors() { + if (colorsBytes == null) { + colorsBytes = new byte[colors.length][]; + for (int i = 0; i != colors.length; i++) { + colorsBytes[i] = colors[i].getBytes(); + } + } + } + + + /** + * Set the vector to sample data that repeats an iteration from 0 to 99. + * @param col + */ + public static void setSampleLongCol(LongColumnVector col) + { + int size = col.vector.length; + for(int i = 0; i!=size; i++) + { + col.vector[i] = i % 100; + } + } + + /** + * Set the vector to random data in the range 0 to 99. + * This has significant overhead for random number generation. Use setSample() to reduce overhead. + */ + public static void setRandomLongCol(LongColumnVector col) { + int size = col.vector.length; + Random rand = new Random(System.currentTimeMillis()); + for(int i = 0; i!=size; i++) + { + col.vector[i] = Math.abs( rand.nextInt() % 100 ); + } + } + + public static void setRepeatingLongCol(LongColumnVector col) { + col.isRepeating = true; + col.vector[0] = 50; + } + + /** + * Set the vector to sample data that repeats an iteration from 0 to 99. + * @param col + */ + public static void setSampleDoubleCol(DoubleColumnVector col) + { + int size = col.vector.length; + for(int i = 0; i!=size; i++) + { + col.vector[i] = i % 100; + } + } + + /** + * Set the vector to random data in the range 0 to 99. + * This has significant overhead for random number generation. Use setSample() to reduce overhead. + */ + public static void setRandomDoubleCol(DoubleColumnVector col) + { + int size = col.vector.length; + Random rand = new Random(); + for(int i = 0; i!=size; i++) + { + col.vector[i] = Math.abs( rand.nextInt() % 100 ); + } + } + + public static void setRepeatingDoubleCol(DoubleColumnVector col) { + col.isRepeating = true; + col.vector[0] = 50.0; + } +}