diff --git orc/src/java/org/apache/orc/impl/BitFieldReader.java orc/src/java/org/apache/orc/impl/BitFieldReader.java
index 8d9d3cb..1317ab2 100644
--- orc/src/java/org/apache/orc/impl/BitFieldReader.java
+++ orc/src/java/org/apache/orc/impl/BitFieldReader.java
@@ -25,19 +25,20 @@
 import org.apache.orc.impl.PositionProvider;
 import org.apache.orc.impl.RunLengthByteReader;
 
+import com.google.common.base.Preconditions;
+
 public class BitFieldReader {
   private final RunLengthByteReader input;
-  /** The number of bits in one item. Non-test code always uses 1. */
-  private final int bitSize;
   private int current;
   private int bitsLeft;
-  private final int mask;
 
-  public BitFieldReader(InStream input,
-      int bitSize) throws IOException {
+  public BitFieldReader(InStream input) throws IOException {
+    this.input = new RunLengthByteReader(input);
+  }
+
+  public BitFieldReader(InStream input, int bitSize) throws IOException {
+    Preconditions.checkArgument(bitSize == 1);
     this.input = new RunLengthByteReader(input);
-    this.bitSize = bitSize;
-    mask = (1 << bitSize) - 1;
   }
 
   public void setInStream(InStream inStream) {
@@ -54,20 +55,54 @@ private void readByte() throws IOException {
   }
 
   public int next() throws IOException {
-    int result = 0;
-    int bitsLeftToRead = bitSize;
-    while (bitsLeftToRead > bitsLeft) {
-      result <<= bitsLeft;
-      result |= current & ((1 << bitsLeft) - 1);
-      bitsLeftToRead -= bitsLeft;
+    if (bitsLeft == 0) {
+      readByte();
+    }
+    bitsLeft--;
+    return (current >>> bitsLeft) & 0x01;
+  }
+
+  /**
+   * Specialized reader to read null bitsets fast
+   * @param isNull output array; isNull[i] is set to true when bit i is 0 (value absent)
+   * @param previousLen number of bits to read into isNull
+   * @return noNulls, true when every bit that was read is 1
+   * @throws IOException
+   */
+  public boolean nextNulls(boolean isNull[], long previousLen) throws IOException {
+    boolean noNulls = true;
+    if (bitsLeft == 0) {
       readByte();
     }
-    if (bitsLeftToRead > 0) {
-      result <<= bitsLeftToRead;
-      bitsLeft -= bitsLeftToRead;
-      result |= (current >>> bitsLeft) & ((1 << bitsLeftToRead) - 1);
+
+    if (previousLen % 8 != 0 || bitsLeft < 8) {
+      for (int i = 0; i < previousLen; i++) {
+        isNull[i] = (next() == 0);
+      }
+      for (int i = 0; i < previousLen; i++) {
+        noNulls = noNulls & (isNull[i] == false);
+      }
+      return noNulls;
+    }
+    // invariant: bitsleft == 8 & len is multiple of 8 (like 1024)
+
+    for (int i = 0; i < previousLen; i += 8) {
+      if (bitsLeft == 0) {
+        readByte();
+      }
+      isNull[i + 0] = 0 == (current & (1 << (7 - 0))); // 7 bits left (>>> 7 & 0x01)
+      isNull[i + 1] = 0 == (current & (1 << (7 - 1)));
+      isNull[i + 2] = 0 == (current & (1 << (7 - 2)));
+      isNull[i + 3] = 0 == (current & (1 << (7 - 3)));
+      isNull[i + 4] = 0 == (current & (1 << (7 - 4)));
+      isNull[i + 5] = 0 == (current & (1 << (7 - 5)));
+      isNull[i + 6] = 0 == (current & (1 << (7 - 6)));
+      isNull[i + 7] = 0 == (current & (1 << (7 - 7))); // no bits left (>>> 0 & 0x01)
+      noNulls = noNulls && (current == 0xff); // all bits present
+      bitsLeft = 0;
     }
-    return result & mask;
+
+    return noNulls;
   }
 
   /**
@@ -79,7 +114,6 @@ public int next() throws IOException {
   private boolean lastRunValue;
   private int lastRunLength = -1;
   private void readNextRun(int maxRunLength) throws IOException {
-    assert bitSize == 1;
     if (lastRunLength > 0) return; // last run is not exhausted yet
     if (bitsLeft == 0) {
       readByte();
@@ -171,7 +205,7 @@ public void seek(PositionProvider index) throws IOException {
   }
 
   public void skip(long items) throws IOException {
-    long totalBits = bitSize * items;
+    long totalBits = 1 * items;
     if (bitsLeft >= totalBits) {
       bitsLeft -= totalBits;
     } else {
@@ -185,7 +219,7 @@ public void skip(long items) throws IOException {
   @Override
   public String toString() {
     return "bit reader current: " + current + " bits left: " + bitsLeft +
-        " bit size: " + bitSize + " from " + input;
+        " bit size: " + 1 + " from " + input;
   }
 
   boolean hasFullByte() {
@@ -193,7 +227,6 @@ boolean hasFullByte() {
   }
 
   int peekOneBit() throws IOException {
-    assert bitSize == 1;
     if (bitsLeft == 0) {
       readByte();
     }
@@ -201,7 +234,6 @@ int peekOneBit() throws IOException {
   }
 
   int peekFullByte() throws IOException {
-    assert bitSize == 1;
     assert bitsLeft == 8 || bitsLeft == 0;
     if (bitsLeft == 0) {
       readByte();
diff --git orc/src/java/org/apache/orc/impl/RunLengthIntegerReaderV2.java orc/src/java/org/apache/orc/impl/RunLengthIntegerReaderV2.java
index 5f2a673..97c0f53 100644
--- orc/src/java/org/apache/orc/impl/RunLengthIntegerReaderV2.java
+++ orc/src/java/org/apache/orc/impl/RunLengthIntegerReaderV2.java
@@ -105,7 +105,7 @@ private void readDeltaValues(int firstByte) throws IOException {
       if (fd == 0) {
         isRepeating = true;
         assert numLiterals == 1;
-        Arrays.fill(literals, numLiterals, numLiterals + len, literals[0]);
+        // literals[0] has the value already
         numLiterals += len;
       } else {
         // add fixed deltas to adjacent values
@@ -302,10 +302,7 @@ private void readShortRepeatValues(int firstByte) throws IOException {
     }
     // repeat the value for length times
     isRepeating = true;
-    // TODO: this is not so useful and V1 reader doesn't do that. Fix? Same if delta == 0
-    for(int i = 0; i < len; i++) {
-      literals[i] = val;
-    }
+    literals[0] = val;
     numLiterals = len;
   }
 
@@ -322,7 +319,9 @@ public long next() throws IOException {
       used = 0;
       readValues(false);
     }
-    result = literals[used++];
+    final int off = (isRepeating) ? 0 : used;
+    used++;
+    result = literals[off];
     return result;
   }
 
@@ -361,25 +360,48 @@ public void skip(long numValues) throws IOException {
 
   @Override
   public void nextVector(LongColumnVector previous, long previousLen) throws IOException {
-    previous.isRepeating = true;
-    for (int i = 0; i < previousLen; i++) {
-      if (!previous.isNull[i]) {
-        previous.vector[i] = next();
+    boolean repeating = true;
+    if (previousLen == 0) {
+      return;
+    }
+    if (previous.noNulls) {
+      if ((numLiterals - used) >= previousLen) {
+        // fast-fwd next() - readValues() resets this
+        if (this.isRepeating) {
+          previous.isRepeating = true;
+          previous.vector[0] = literals[0];
+        } else {
+          // values are copied verbatim, so clear any stale flag from an earlier batch
+          previous.isRepeating = false;
+          System.arraycopy(literals, used, previous.vector, 0, (int) previousLen);
+        }
+        used += previousLen;
       } else {
+        // run out of literals
+        final long l0 = previous.vector[0] = next();
+        for (int i = 1; i < previousLen; i++) {
+          final long l1 = previous.vector[i] = next();
+          repeating = repeating && (l0 == l1);
+        }
+        previous.isRepeating = repeating;
+      }
+    } else {
+      final boolean n0 = previous.isNull[0];
+      // slot 0 must be stored too; it is the reference for the isRepeating check
+      final long l0 = previous.vector[0] = (n0 ? 1 : next());
+
+      for (int i = 1; i < previousLen; i++) {
+        final boolean n1 = previous.isNull[i];
         // The default value of null for int type in vectorized
         // processing is 1, so set that if the value is null
-        previous.vector[i] = 1;
-      }
+        final long l1 = previous.vector[i] = (n1 ? 1 : next());
 
-      // The default value for nulls in Vectorization for int types is 1
-      // and given that non null value can also be 1, we need to check for isNull also
-      // when determining the isRepeating flag.
-      if (previous.isRepeating
-          && i > 0
-          && (previous.vector[i - 1] != previous.vector[i] ||
-          previous.isNull[i - 1] != previous.isNull[i])) {
-        previous.isRepeating = false;
+        // The default value for nulls in Vectorization for int types is 1
+        // and given that non null value can also be 1, we need to check for isNull also
+        // when determining the isRepeating flag.
+        repeating = repeating && (l0 == l1) && (n0 == n1);
       }
+      previous.isRepeating = repeating;
     }
   }
 
diff --git orc/src/test/org/apache/orc/impl/TestBitFieldReader.java orc/src/test/org/apache/orc/impl/TestBitFieldReader.java
index e4c6f6b..bfc535b 100644
--- orc/src/test/org/apache/orc/impl/TestBitFieldReader.java
+++ orc/src/test/org/apache/orc/impl/TestBitFieldReader.java
@@ -22,6 +22,7 @@
 import java.nio.ByteBuffer;
 
 import org.apache.orc.CompressionCodec;
+import org.junit.Ignore;
 import org.junit.Test;
 
 public class TestBitFieldReader {
@@ -80,6 +81,68 @@ public void testCompressedSeek() throws Exception {
   }
 
   @Test
+  public void testNullReader() throws Exception {
+    TestInStream.OutputCollector collect = new TestInStream.OutputCollector();
+    BitFieldWriter out = new BitFieldWriter(
+        new OutStream("test", 500, null, collect), 1);
+    /* Cases:
+     * 1024 x 1
+     * 1024 x 0
+     * 7 x 1 + 17 x 0
+     * 1024 x (0|1)
+     */
+    for (int i=0; i < 1024; i++) {
+      out.write(1);
+    }
+    for (int i=0; i < 1024; i++) {
+      out.write(0);
+    }
+    for (int i=0; i < 7; i++) {
+      out.write(1);
+    }
+    for (int i = 0; i < 17; i++) {
+      out.write(0);
+    }
+    for (int i=0; i < 1024; i++) {
+      out.write(i%2);
+    }
+    out.flush();
+    ByteBuffer inBuf = ByteBuffer.allocate(collect.buffer.size());
+    collect.buffer.setByteBuffer(inBuf, 0, collect.buffer.size());
+    inBuf.flip();
+    BitFieldReader in = new BitFieldReader(InStream.create("test",
+        new ByteBuffer[]{inBuf}, new long[]{0}, inBuf.remaining(),
+        null, 500), 1);
+    final boolean[] isNull = new boolean[1024];
+    // 1024 x 1
+    assertEquals(true, in.nextNulls(isNull, 1024));
+    for(int i = 0; i < 1024; i++) {
+      assertEquals(false, isNull[i]);
+    }
+    // 1024 x 0
+    assertEquals(false, in.nextNulls(isNull, 1024));
+    for(int i = 0; i < 1024; i++) {
+      assertEquals(true, isNull[i]);
+    }
+    // 7 x 1
+    assertEquals(true, in.nextNulls(isNull, 7));
+    for(int i = 0; i < 7; i++) {
+      assertEquals(false, isNull[i]);
+    }
+    // 17 x 0
+    assertEquals(false, in.nextNulls(isNull, 17));
+    for(int i = 0; i < 17; i++) {
+      assertEquals(true, isNull[i]);
+    }
+    // 1024 x (0|1)
+    assertEquals(false, in.nextNulls(isNull, 1024));
+    for(int i = 0; i < 1024; i++) {
+      assertEquals(i%2 == 0, isNull[i]);
+    }
+  }
+
+  @Ignore("No support for 3-bit fields")
+  @Test
   public void testBiggerItems() throws Exception {
     TestInStream.OutputCollector collect = new TestInStream.OutputCollector();
     final int COUNT = 16384;
diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/TreeReaderFactory.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/TreeReaderFactory.java
index 620ad53..23bf3f4 100644
--- ql/src/java/org/apache/hadoop/hive/ql/io/orc/TreeReaderFactory.java
+++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/TreeReaderFactory.java
@@ -244,13 +244,7 @@ public Object nextVector(Object previousVector, long batchSize) throws IOExcepti
       if (present != null) {
         // Set noNulls and isNull vector of the ColumnVector based on
         // present stream
-        result.noNulls = true;
-        for (int i = 0; i < batchSize; i++) {
-          result.isNull[i] = (present.next() != 1);
-          if (result.noNulls && result.isNull[i]) {
-            result.noNulls = false;
-          }
-        }
+        result.noNulls = present.nextNulls(result.isNull, batchSize);
       } else {
         // There is not present stream, this means that all the values are
         // present.