diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.java index a210451..1087622 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.java @@ -208,7 +208,8 @@ public Writable getWritableObject(int index) { } Writable result = null; if (!isNull[index] && vector[index] != null) { - result = new Text(vector[index]); + result = new Text(); + ((Text) result).append(vector[index], start[index], length[index]); } else { result = NullWritable.get(); } diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java index e43c1a0..f15779d 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java @@ -930,34 +930,42 @@ Object nextVector(Object previousVector, long batchSize) throws IOException { // Read present/isNull stream super.nextVector(result, batchSize); - byte[] dictionaryBytes = dictionaryBuffer.get(); - - // Read string offsets - scratchlcv.isNull = result.isNull; - reader.nextVector(scratchlcv, batchSize); - if (!scratchlcv.isRepeating) { - - // The vector has non-repeating strings. Iterate thru the batch - // and set strings one by one - for (int i = 0; i < batchSize; i++) { - if (!scratchlcv.isNull[i]) { - offset = dictionaryOffsets[(int) scratchlcv.vector[i]]; - length = getDictionaryEntryLength((int) scratchlcv.vector[i], offset); - result.setRef(i, dictionaryBytes, offset, length); - } else { - // If the value is null then set offset and length to zero (null string) - result.setRef(i, dictionaryBytes, 0, 0); + if (dictionaryBuffer != null) { + byte[] dictionaryBytes = dictionaryBuffer.get(); + + // Read string offsets + scratchlcv.isNull = result.isNull; + reader.nextVector(scratchlcv, batchSize); + if (!scratchlcv.isRepeating) { + + // The vector has non-repeating strings. Iterate thru the batch + // and set strings one by one + for (int i = 0; i < batchSize; i++) { + if (!scratchlcv.isNull[i]) { + offset = dictionaryOffsets[(int) scratchlcv.vector[i]]; + length = getDictionaryEntryLength((int) scratchlcv.vector[i], offset); + result.setRef(i, dictionaryBytes, offset, length); + } else { + // If the value is null then set offset and length to zero (null string) + result.setRef(i, dictionaryBytes, 0, 0); + } } + } else { + // If the value is repeating then just set the first value in the + // vector and set the isRepeating flag to true. No need to iterate thru and + // set all the elements to the same value + offset = dictionaryOffsets[(int) scratchlcv.vector[0]]; + length = getDictionaryEntryLength((int) scratchlcv.vector[0], offset); + result.setRef(0, dictionaryBytes, offset, length); } + result.isRepeating = scratchlcv.isRepeating; } else { - // If the value is repeating then just set the first value in the - // vector and set the isRepeating flag to true. No need to iterate thru and - // set all the elements to the same value - offset = dictionaryOffsets[(int) scratchlcv.vector[0]]; - length = getDictionaryEntryLength((int) scratchlcv.vector[0], offset); - result.setRef(0, dictionaryBytes, offset, length); - } - result.isRepeating = scratchlcv.isRepeating; + // Entire stripe contains null strings. + result.isRepeating = true; + result.noNulls = false; + result.isNull[0] = true; + result.setRef(0, "".getBytes(), 0, 0); + } return result; } diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/RunLengthIntegerReader.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/RunLengthIntegerReader.java index 0b987ce..b30895f 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/RunLengthIntegerReader.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/RunLengthIntegerReader.java @@ -101,7 +101,7 @@ void nextVector(LongColumnVector previous, long previousLen) // processing is 1, so set that if the value is null previous.vector[i] = 1; } - if (previous.isRepeating && (delta != 0 || !repeat)) { + if (previous.isRepeating && i > 0 && (previous.vector[i-1] != previous.vector[i])) { previous.isRepeating = false; } } diff --git ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestVectorizedORCReader.java ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestVectorizedORCReader.java index f87763f..e8c75c6 100644 --- ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestVectorizedORCReader.java +++ ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestVectorizedORCReader.java @@ -29,6 +29,7 @@ import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Writable; import org.junit.Before; import org.junit.Test; @@ -124,7 +125,7 @@ private void checkVectorizedReader() throws Exception { Object a = ((Writable) row.getFieldValue(j)); Object b = batch.cols[j].getWritableObject(i); if (null == a) { - Assert.assertEquals(true, (b == null)); + Assert.assertEquals(true, (b == null || (b instanceof NullWritable))); } else { Assert.assertEquals(true, b.toString().equals(a.toString())); }