diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedBatchUtil.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedBatchUtil.java index 0e19e59..ffdc995 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedBatchUtil.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedBatchUtil.java @@ -28,6 +28,8 @@ import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.StructField; import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.io.BooleanWritable; +import org.apache.hadoop.io.ByteWritable; import org.apache.hadoop.io.FloatWritable; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.LongWritable; @@ -88,6 +90,28 @@ public static void AddRowToBatch(Object row, StructObjectInspector oi, int rowIn // NOTE: The default value for null fields in vectorization is 1 for int types, NaN for // float/double. String types have no default value for null. switch (poi.getPrimitiveCategory()) { + case BOOLEAN: { + LongColumnVector lcv = (LongColumnVector) batch.cols[i]; + if (writableCol != null) { + lcv.vector[rowIndex] = ((BooleanWritable) writableCol).get() ? 
1 : 0; + lcv.isNull[rowIndex] = false; + } else { + lcv.vector[rowIndex] = 1; + SetNullColIsNullValue(lcv, rowIndex); + } + } + break; + case BYTE: { + LongColumnVector lcv = (LongColumnVector) batch.cols[i]; + if (writableCol != null) { + lcv.vector[rowIndex] = ((ByteWritable) writableCol).get(); + lcv.isNull[rowIndex] = false; + } else { + lcv.vector[rowIndex] = 1; + SetNullColIsNullValue(lcv, rowIndex); + } + } + break; case SHORT: { LongColumnVector lcv = (LongColumnVector) batch.cols[i]; if (writableCol != null) { diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatchCtx.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatchCtx.java index 1a57766..efcd158 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatchCtx.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatchCtx.java @@ -197,8 +197,10 @@ public VectorizedRowBatch CreateVectorizedRowBatch() throws HiveException case PRIMITIVE: { PrimitiveObjectInspector poi = (PrimitiveObjectInspector) foi; // Vectorization currently only supports the following data types: - // SHORT, INT, LONG, FLOAT, DOUBLE, STRING + // BOOLEAN, BYTE, SHORT, INT, LONG, FLOAT, DOUBLE, STRING switch (poi.getPrimitiveCategory()) { + case BOOLEAN: + case BYTE: case SHORT: case INT: case LONG: @@ -232,9 +234,6 @@ public VectorizedRowBatch CreateVectorizedRowBatch() throws HiveException return result; } - - - /** * Adds the row to the batch after deserializing the row * diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/BitFieldReader.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/BitFieldReader.java index 11c3fdf..16c063f 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/BitFieldReader.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/BitFieldReader.java @@ -20,8 +20,10 @@ import java.io.EOFException; import java.io.IOException; +import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; + class BitFieldReader { - private 
RunLengthByteReader input; + private final RunLengthByteReader input; private final int bitSize; private int current; private int bitsLeft; @@ -60,6 +62,23 @@ int next() throws IOException { return result & mask; } + void nextVector(LongColumnVector previous, long previousLen) + throws IOException { + previous.isRepeating = true; + for (int i = 0; i < previousLen; i++) { + if (!previous.isNull[i]) { + previous.vector[i] = next(); + } else { + // The default value of null for int types in vectorized + // processing is 1, so set that if the value is null + previous.vector[i] = 1; + } + if (previous.isRepeating && i > 0 && (previous.vector[i-1] != previous.vector[i])) { + previous.isRepeating = false; + } + } + } + void seek(PositionProvider index) throws IOException { input.seek(index); int consumed = (int) index.getNext(); diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java index f15779d..6a1efaa 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java @@ -240,8 +240,19 @@ Object next(Object previous) throws IOException { @Override Object nextVector(Object previousVector, long batchSize) throws IOException { - throw new UnsupportedOperationException( - "NextVector is not supported operation on Boolean type"); + LongColumnVector result = null; + if (previousVector == null) { + result = new LongColumnVector(); + } else { + result = (LongColumnVector) previousVector; + } + + // Read present/isNull stream + super.nextVector(result, batchSize); + + // Read value entries based on isNull entries + reader.nextVector(result, batchSize); + return result; } } @@ -284,8 +295,19 @@ Object next(Object previous) throws IOException { @Override Object nextVector(Object previousVector, long batchSize) throws IOException { - throw new UnsupportedOperationException( - "NextVector is not supported 
operation for Byte type"); + LongColumnVector result = null; + if (previousVector == null) { + result = new LongColumnVector(); + } else { + result = (LongColumnVector) previousVector; + } + + // Read present/isNull stream + super.nextVector(result, batchSize); + + // Read value entries based on isNull entries + reader.nextVector(result, batchSize); + return result; } @Override diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/RunLengthByteReader.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/RunLengthByteReader.java index 678a2f5..23d54e1 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/RunLengthByteReader.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/RunLengthByteReader.java @@ -20,6 +20,8 @@ import java.io.EOFException; import java.io.IOException; +import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; + /** * A reader that reads a sequence of bytes. A control byte is read before * each run with positive values 0 to 127 meaning 3 to 130 repetitions. If the @@ -82,6 +84,23 @@ byte next() throws IOException { return result; } + void nextVector(LongColumnVector previous, long previousLen) + throws IOException { + previous.isRepeating = true; + for (int i = 0; i < previousLen; i++) { + if (!previous.isNull[i]) { + previous.vector[i] = next(); + } else { + // The default value of null for int types in vectorized + // processing is 1, so set that if the value is null + previous.vector[i] = 1; + } + if (previous.isRepeating && i > 0 && (previous.vector[i-1] != previous.vector[i])) { + previous.isRepeating = false; + } + } + } + void seek(PositionProvider index) throws IOException { input.seek(index); int consumed = (int) index.getNext(); diff --git ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestVectorizedORCReader.java ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestVectorizedORCReader.java index e8c75c6..0fb6085 100644 --- ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestVectorizedORCReader.java +++ 
ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestVectorizedORCReader.java @@ -29,6 +29,7 @@ import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.io.BooleanWritable; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Writable; import org.junit.Before; @@ -58,13 +59,17 @@ public void openFileSystem() throws Exception { } static class MyRecord { + private final Boolean bo; + private final Byte by; private final Integer i; private final Long l; private final Short s; private final Double d; private final String k; - MyRecord(Integer i, Long l, Short s, Double d, String k) { + MyRecord(Boolean bo, Byte by, Integer i, Long l, Short s, Double d, String k) { + this.bo = bo; + this.by = by; this.i = i; this.l = l; this.s = s; @@ -96,11 +101,11 @@ public void createFile() throws Exception { "Heaven,", "we", "were", "all", "going", "direct", "the", "other", "way"}; for (int i = 0; i < 21000; ++i) { - if ((i % 3) != 0) { - writer.addRow(new MyRecord(i, (long) 200, (short) (300 + i), (double) (400 + i), + if ((i % 7) != 0) { + writer.addRow(new MyRecord(((i % 3) == 0), (byte)(i % 5), i, (long) 200, (short) (300 + i), (double) (400 + i), words[r1.nextInt(words.length)])); } else { - writer.addRow(new MyRecord(i, (long) 200, null, null, null)); + writer.addRow(new MyRecord(null, null, i, (long) 200, null, null, null)); } } writer.close(); @@ -124,6 +129,12 @@ private void checkVectorizedReader() throws Exception { for (int j = 0; j < batch.cols.length; j++) { Object a = ((Writable) row.getFieldValue(j)); Object b = batch.cols[j].getWritableObject(i); + // Boolean values are stored as 1's and 0's and hence this special case is required. + if (a instanceof BooleanWritable) { + Long temp = (long) (((BooleanWritable) a).get() ? 
1 : 0); + Assert.assertEquals(true, temp.toString().equals(b.toString())); + continue; + } if (null == a) { Assert.assertEquals(true, (b == null || (b instanceof NullWritable))); } else { @@ -134,17 +145,21 @@ private void checkVectorizedReader() throws Exception { // Check repeating Assert.assertEquals(false, batch.cols[0].isRepeating); - Assert.assertEquals(true, batch.cols[1].isRepeating); + Assert.assertEquals(false, batch.cols[1].isRepeating); Assert.assertEquals(false, batch.cols[2].isRepeating); - Assert.assertEquals(false, batch.cols[3].isRepeating); + Assert.assertEquals(true, batch.cols[3].isRepeating); Assert.assertEquals(false, batch.cols[4].isRepeating); + Assert.assertEquals(false, batch.cols[5].isRepeating); + Assert.assertEquals(false, batch.cols[6].isRepeating); // Check non null - Assert.assertEquals(true, batch.cols[0].noNulls); - Assert.assertEquals(true, batch.cols[1].noNulls); - Assert.assertEquals(false, batch.cols[2].noNulls); - Assert.assertEquals(false, batch.cols[3].noNulls); + Assert.assertEquals(false, batch.cols[0].noNulls); + Assert.assertEquals(false, batch.cols[1].noNulls); + Assert.assertEquals(true, batch.cols[2].noNulls); + Assert.assertEquals(true, batch.cols[3].noNulls); Assert.assertEquals(false, batch.cols[4].noNulls); + Assert.assertEquals(false, batch.cols[5].noNulls); + Assert.assertEquals(false, batch.cols[6].noNulls); } Assert.assertEquals(false, rr.hasNext()); }