diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/RunLengthIntegerWriterV2.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/RunLengthIntegerWriterV2.java index 5bd4599..5e29228 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/RunLengthIntegerWriterV2.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/RunLengthIntegerWriterV2.java @@ -418,71 +418,73 @@ private void writeShortRepeatValues() throws IOException { private void determineEncoding() { - int idx = 0; + // we need to compute zigzag values for DIRECT encoding if we decide to + // break early for delta overflows or for shorter runs + computeZigZagLiterals(); + + double p = 1.0; + zzBits100p = utils.percentileBits(zigzagLiterals, 0, numLiterals, p); + + // not a big win for shorter runs to determine encoding + if (numLiterals <= MIN_REPEAT) { + encoding = EncodingType.DIRECT; + return; + } + + // DELTA encoding check // for identifying monotonic sequences boolean isIncreasing = false; int increasingCount = 1; boolean isDecreasing = false; int decreasingCount = 1; - - // for identifying type of delta encoding - min = literals[0]; - long max = literals[0]; isFixedDelta = true; long currDelta = 0; - - min = literals[0]; long deltaMax = 0; + boolean deltaOverflow = false; // populate all variables to identify the encoding type - if (numLiterals >= 1) { - currDelta = literals[1] - literals[0]; - for(int i = 0; i < numLiterals; i++) { - if (i > 0 && literals[i] >= max) { - max = literals[i]; - increasingCount++; + if (!utils.isSafeSubtract(literals[1], literals[0])) { + deltaOverflow = true; + } + + currDelta = literals[1] - literals[0]; + + for(int i = 0; i < numLiterals && !deltaOverflow; i++) { + + // if delta doesn't changes then mark it as fixed delta + if (i > 0 && isFixedDelta) { + if (!utils.isSafeSubtract(literals[i], literals[i - 1])) { + deltaOverflow = true; } - if (i > 0 && literals[i] <= min) { - min = literals[i]; - decreasingCount++; + if (literals[i] - literals[i - 1] != currDelta) { + isFixedDelta = false; } - // if delta doesn't changes then mark it as fixed delta - if (i > 0 && isFixedDelta) { - if (literals[i] - literals[i - 1] != currDelta) { - isFixedDelta = false; - } + fixedDelta = currDelta; + } - fixedDelta = currDelta; + // max delta value is required for computing the fixed bits + // required for delta blob in delta encoding + if (i > 0) { + if (!utils.isSafeSubtract(literals[i], literals[i - 1])) { + deltaOverflow = true; } - // populate zigzag encoded literals - long zzEncVal = 0; - if (signed) { - zzEncVal = utils.zigzagEncode(literals[i]); + if (i == 1) { + // first value preserve the sign + adjDeltas[i - 1] = literals[i] - literals[i - 1]; } else { - zzEncVal = literals[i]; - } - zigzagLiterals[idx] = zzEncVal; - idx++; - - // max delta value is required for computing the fixed bits - // required for delta blob in delta encoding - if (i > 0) { - if (i == 1) { - // first value preserve the sign - adjDeltas[i - 1] = literals[i] - literals[i - 1]; - } else { - adjDeltas[i - 1] = Math.abs(literals[i] - literals[i - 1]); - if (adjDeltas[i - 1] > deltaMax) { - deltaMax = adjDeltas[i - 1]; - } + adjDeltas[i - 1] = Math.abs(literals[i] - literals[i - 1]); + if (adjDeltas[i - 1] > deltaMax) { + deltaMax = adjDeltas[i - 1]; } } } + } + if (!deltaOverflow) { // stores the number of bits required for packing delta blob in // delta encoding bitsDeltaMax = utils.findClosestNumBits(deltaMax); @@ -498,47 +500,57 @@ private void determineEncoding() { if (decreasingCount == 1 && increasingCount == numLiterals) { isIncreasing = true; } - } - // if the sequence is both increasing and decreasing then it is not - // monotonic - if (isDecreasing && isIncreasing) { - isDecreasing = false; - isIncreasing = false; - } + // if the sequence is both increasing and decreasing then it is not + // monotonic + if (isDecreasing && isIncreasing) { + isDecreasing = false; + isIncreasing = false; + } - // fixed delta condition - if (isIncreasing == false && isDecreasing == false && isFixedDelta == true) { - encoding = EncodingType.DELTA; - return; - } + // fixed delta condition + if (isIncreasing == false && isDecreasing == false && isFixedDelta == true) { + encoding = EncodingType.DELTA; + return; + } - // monotonic condition - if (isIncreasing || isDecreasing) { - encoding = EncodingType.DELTA; - return; + // monotonic condition + if (isIncreasing || isDecreasing) { + encoding = EncodingType.DELTA; + return; + } } + // PATCHED_BASE encoding check + // percentile values are computed for the zigzag encoded values. if the // number of bit requirement between 90th and 100th percentile varies // beyond a threshold then we need to patch the values. if the variation - // is not significant then we can use direct or delta encoding + // is not significant then we can use direct encoding - double p = 0.9; + p = 0.9; zzBits90p = utils.percentileBits(zigzagLiterals, 0, numLiterals, p); - - p = 1.0; - zzBits100p = utils.percentileBits(zigzagLiterals, 0, numLiterals, p); - int diffBitsLH = zzBits100p - zzBits90p; // if the difference between 90th percentile and 100th percentile fixed // bits is > 1 then we need patch the values - if (isIncreasing == false && isDecreasing == false && diffBitsLH > 1 - && isFixedDelta == false) { + if (diffBitsLH > 1) { + + // min value is required for base reduction + min = literals[0]; + for (int i = 1; i < numLiterals; i++) { + if (literals[i] < min) { + min = literals[i]; + } + } + // patching is done only on base reduced values. // remove base from literals for(int i = 0; i < numLiterals; i++) { + if (!utils.isSafeSubtract(literals[i], min)) { + encoding = EncodingType.DIRECT; + return; + } baseRedLiterals[i] = literals[i] - min; } @@ -565,19 +577,24 @@ private void determineEncoding() { encoding = EncodingType.DIRECT; return; } - } - - // if difference in bits between 95th percentile and 100th percentile is - // 0, then patch length will become 0. Hence we will fallback to direct - if (isIncreasing == false && isDecreasing == false && diffBitsLH <= 1 - && isFixedDelta == false) { + } else { + // if difference in bits between 95th percentile and 100th percentile is + // 0, then patch length will become 0. Hence we will fallback to direct encoding = EncodingType.DIRECT; return; } + } - // this should not happen - if (encoding == null) { - throw new RuntimeException("Integer encoding cannot be determined."); + private void computeZigZagLiterals() { + // populate zigzag encoded literals + long zzEncVal = 0; + for (int i = 0; i < numLiterals; i++) { + if (signed) { + zzEncVal = utils.zigzagEncode(literals[i]); + } else { + zzEncVal = literals[i]; + } + zigzagLiterals[i] = zzEncVal; } } diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/SerializationUtils.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/SerializationUtils.java index b5380c0..b14fa7b 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/SerializationUtils.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/SerializationUtils.java @@ -1283,4 +1283,9 @@ private long readLongBE8(InStream in, int rbOffset) { + ((readBuffer[rbOffset + 7] & 255) << 0)); } + // Do not want to use Guava LongMath.checkedSubtract() here as it will throw + // ArithmeticException in case of overflow + public boolean isSafeSubtract(long left, long right) { + return (left ^ right) >= 0 | (left ^ (left - right)) >= 0; + } } diff --git ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestNewIntegerEncoding.java ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestNewIntegerEncoding.java index 6d6f132..0f606a4 100644 --- ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestNewIntegerEncoding.java +++ ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestNewIntegerEncoding.java @@ -335,6 +335,104 @@ public void testBasicDelta4() throws Exception { } @Test + public void testDeltaOverflow() throws Exception { + ObjectInspector inspector; + synchronized (TestOrcFile.class) { + inspector = ObjectInspectorFactory + .getReflectionObjectInspector(Long.class, + ObjectInspectorFactory.ObjectInspectorOptions.JAVA); + } + + long[] inp = new long[]{4513343538618202719l, 4513343538618202711l, + 2911390882471569739l, + -9181829309989854913l}; + List input = Lists.newArrayList(Longs.asList(inp)); + + Writer writer = OrcFile.createWriter( + testFilePath, + OrcFile.writerOptions(conf).inspector(inspector).stripeSize(100000) + .compress(CompressionKind.NONE).bufferSize(10000)); + for (Long l : input) { + writer.addRow(l); + } + writer.close(); + + Reader reader = OrcFile + .createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); + RecordReader rows = reader.rows(); + int idx = 0; + while (rows.hasNext()) { + Object row = rows.next(null); + assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get()); + } + } + + @Test + public void testDeltaOverflow2() throws Exception { + ObjectInspector inspector; + synchronized (TestOrcFile.class) { + inspector = ObjectInspectorFactory + .getReflectionObjectInspector(Long.class, + ObjectInspectorFactory.ObjectInspectorOptions.JAVA); + } + + long[] inp = new long[]{Long.MAX_VALUE, 4513343538618202711l, + 2911390882471569739l, + Long.MIN_VALUE}; + List input = Lists.newArrayList(Longs.asList(inp)); + + Writer writer = OrcFile.createWriter( + testFilePath, + OrcFile.writerOptions(conf).inspector(inspector).stripeSize(100000) + .compress(CompressionKind.NONE).bufferSize(10000)); + for (Long l : input) { + writer.addRow(l); + } + writer.close(); + + Reader reader = OrcFile + .createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); + RecordReader rows = reader.rows(); + int idx = 0; + while (rows.hasNext()) { + Object row = rows.next(null); + assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get()); + } + } + + @Test + public void testDeltaOverflow3() throws Exception { + ObjectInspector inspector; + synchronized (TestOrcFile.class) { + inspector = ObjectInspectorFactory + .getReflectionObjectInspector(Long.class, + ObjectInspectorFactory.ObjectInspectorOptions.JAVA); + } + + long[] inp = new long[]{-4513343538618202711l, -2911390882471569739l, -2, + Long.MAX_VALUE}; + List input = Lists.newArrayList(Longs.asList(inp)); + + Writer writer = OrcFile.createWriter( + testFilePath, + OrcFile.writerOptions(conf).inspector(inspector).stripeSize(100000) + .compress(CompressionKind.NONE).bufferSize(10000)); + for (Long l : input) { + writer.addRow(l); + } + writer.close(); + + Reader reader = OrcFile + .createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); + RecordReader rows = reader.rows(); + int idx = 0; + while (rows.hasNext()) { + Object row = rows.next(null); + assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get()); + } + } + + @Test public void testIntegerMin() throws Exception { ObjectInspector inspector; synchronized (TestOrcFile.class) { diff --git ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestSerializationUtils.java ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestSerializationUtils.java index 4a49f09..b3f9cf1 100644 --- ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestSerializationUtils.java +++ ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestSerializationUtils.java @@ -17,15 +17,18 @@ */ package org.apache.hadoop.hive.ql.io.orc; -import org.junit.Test; +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.fail; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.InputStream; import java.math.BigInteger; -import static org.junit.Assert.assertArrayEquals; -import static org.junit.Assert.assertEquals; +import org.junit.Test; + +import com.google.common.math.LongMath; public class TestSerializationUtils { @@ -112,6 +115,47 @@ public void testBigIntegers() throws Exception { SerializationUtils.readBigInteger(fromBuffer(buffer))); } + @Test + public void testSubtractionOverflow() { + // cross check results with Guava results below + SerializationUtils utils = new SerializationUtils(); + assertEquals(false, utils.isSafeSubtract(22222222222L, Long.MIN_VALUE)); + assertEquals(false, utils.isSafeSubtract(-22222222222L, Long.MAX_VALUE)); + assertEquals(false, utils.isSafeSubtract(Long.MIN_VALUE, Long.MAX_VALUE)); + assertEquals(true, utils.isSafeSubtract(-1553103058346370095L, 6553103058346370095L)); + assertEquals(true, utils.isSafeSubtract(0, Long.MAX_VALUE)); + assertEquals(true, utils.isSafeSubtract(Long.MIN_VALUE, 0)); + } + + @Test + public void testSubtractionOverflowGuava() { + try { + LongMath.checkedSubtract(22222222222L, Long.MIN_VALUE); + fail("expected ArithmeticException for overflow"); + } catch (ArithmeticException ex) { + assertEquals(ex.getMessage(), "overflow"); + } + + try { + LongMath.checkedSubtract(-22222222222L, Long.MAX_VALUE); + fail("expected ArithmeticException for overflow"); + } catch (ArithmeticException ex) { + assertEquals(ex.getMessage(), "overflow"); + } + + try { + LongMath.checkedSubtract(Long.MIN_VALUE, Long.MAX_VALUE); + fail("expected ArithmeticException for overflow"); + } catch (ArithmeticException ex) { + assertEquals(ex.getMessage(), "overflow"); + } + + assertEquals(-8106206116692740190L, + LongMath.checkedSubtract(-1553103058346370095L, 6553103058346370095L)); + assertEquals(-Long.MAX_VALUE, LongMath.checkedSubtract(0, Long.MAX_VALUE)); + assertEquals(Long.MIN_VALUE, LongMath.checkedSubtract(Long.MIN_VALUE, 0)); + } + public static void main(String[] args) throws Exception { TestSerializationUtils test = new TestSerializationUtils(); test.testDoubles();