Index: serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyLong.java =================================================================== --- serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyLong.java (revision 1528177) +++ serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyLong.java (working copy) @@ -19,22 +19,23 @@ import java.io.IOException; import java.io.OutputStream; +import java.math.BigDecimal; import org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyLongObjectInspector; import org.apache.hadoop.io.LongWritable; /** * LazyObject for storing a value of Long. - * + * *

* Part of the code is adapted from Apache Harmony Project. - * + * * As with the specification, this implementation relied on code laid out in Henry S. Warren, Jr.'s Hacker's * Delight, (Addison Wesley, 2002) as well as The Aggregate's Magic Algorithms. *

- * + * */ public class LazyLong extends LazyPrimitive { @@ -64,7 +65,7 @@ * Parses the string argument as if it was a long value and returns the * result. Throws NumberFormatException if the string does not represent a * long quantity. - * + * * @param bytes * @param start * @param length @@ -82,7 +83,7 @@ * result. Throws NumberFormatException if the string does not represent an * long quantity. The second argument specifies the radix to use when parsing * the value. - * + * * @param bytes * @param start * @param length @@ -117,11 +118,135 @@ } /** + * Function to check if the part post exponent is represented correctly. + * For now, we will allow only exponents between Long.MIN_VALUE and + * Long.MAX_VALUE. + * Eg. If the number is 1E+5, the fn returns 5. + * If the number is 1E-5, the fn returns -5. + * @param bytes + * byte representation of the original number + * @param start + * start of the original number + * @param end + * end position of the original number. + * @param offset + * the starting position after the exponent symbol (if exists) + * @param radix + * the base to use for conversion. + * @param length + * a UTF-8 encoded string representation of an long quantity for the original number. + * @return the integer value post exponent symbol. + * @exception NumberFormatException + * if the argument could not be parsed as an long quantity. 
+   */
+  private static long parsePostExponent(byte[] bytes, int start, int end, int offset,
+      int radix, int length) {
+    // Reject an exponent symbol with nothing after it (e.g. "1e"). Without this
+    // guard bytes[offset] below would be read at or past 'end', i.e. outside the
+    // number being parsed (or outside the array altogether).
+    if (offset >= end) {
+      throw new NumberFormatException(LazyUtils.convertToString(bytes, start,
+          length));
+    }
+
+    boolean negative = bytes[offset] == '-';
+    if (negative || bytes[offset] == '+') {
+      offset++;
+    }
+
+    // Reject a sign that is not followed by any digit (e.g. "1e+" or "1e-");
+    // otherwise the loop below would not run and 0 would be returned.
+    if (offset >= end) {
+      throw new NumberFormatException(LazyUtils.convertToString(bytes, start,
+          length));
+    }
+
+    // The magnitude is accumulated as a non-positive number so that values down
+    // to Long.MIN_VALUE can be held before the sign is applied.
+    long max = Long.MIN_VALUE / radix;
+    long result = 0;
+
+    while (offset < end) {
+      int digit = LazyUtils.digit(bytes[offset++], radix);
+      if (digit == -1) {
+        // The byte is not a digit in this radix.
+        throw new NumberFormatException(LazyUtils.convertToString(bytes, start,
+            length));
+      }
+      // Multiplying by the radix would overflow.
+      if (max > result) {
+        throw new NumberFormatException(LazyUtils.convertToString(bytes, start,
+            length));
+      }
+      long next = result * radix - digit;
+      // Subtracting the digit wrapped around.
+      if (next > result) {
+        throw new NumberFormatException(LazyUtils.convertToString(bytes, start,
+            length));
+      }
+      result = next;
+    }
+    if (!negative) {
+      result = -result;
+      // -Long.MIN_VALUE is still negative: the positive exponent does not fit.
+      if (result < 0) {
+        throw new NumberFormatException(LazyUtils.convertToString(bytes, start,
+            length));
+      }
+    }
+    return result;
+  }
+
+  /**
+   * Function to check if the fractional part post decimal is represented correctly.
+   * The value returned is the resultSoFar when the fractional part contains only digits.
+   * The value returned is the evaluated result when the fractional part has exponential
+   * notation. The examples below show the value after the caller has applied the sign
+   * and truncated it.
+   * For e.g. 1. if number = 1.2e+5, the value will be 120000
+   *          2. if number = 1.2e-5, the value will be 0 (0.000012)
+   *          3. if number = 1.2345 the value will be 1
+   * @param bytes
+   *          byte representation of the original number
+   * @param start
+   *          start of the original number
+   * @param end
+   *          end position of the original number.
+   * @param offset
+   *          the starting position after the decimal separator
+   * @param radix
+   *          the base to use for conversion.
+   * @param length
+   *          length in bytes of the original number.
+   * @param resultSoFar
+   *          result of the integer part.
+   * @return the parsed value, carrying the sign convention of resultSoFar.
+   * @exception NumberFormatException
+   *              if the argument could not be parsed as a long quantity.
+   */
+  private static BigDecimal parsePostDecimal(byte[] bytes, int start, int end, int offset,
+      int radix, int length, long resultSoFar) {
+    // Take the magnitude in double space: Math.abs(Long.MIN_VALUE) is still
+    // negative as a long, which would flip the sign of the final result.
+    // NOTE(review): double arithmetic loses precision for magnitudes above 2^53.
+    double result = Math.abs((double) resultSoFar);
+    int pos = 0;
+
+    // This is the case when we've encountered a decimal separator. The fractional
+    // part will not change the number unless we encounter an exponential notation,
+    // but we will verify that the fractional part is well formed.
+    while (offset < end) {
+      byte current = bytes[offset++];
+      int digit = LazyUtils.digit(current, radix);
+      // Process the remaining part if we hit an exponential notation symbol. As in
+      // parse(), the symbol is only honoured when it is not a digit in this radix.
+      if (digit == -1 && (current == 'e' || current == 'E')) {
+        // Don't allow numbers to end with the exponent symbol or with a bare sign.
+        // This is checked on the bytes so that an explicit zero exponent such as
+        // "1.5e0" remains valid.
+        int firstDigit = offset;
+        if (firstDigit < end && (bytes[firstDigit] == '-' || bytes[firstDigit] == '+')) {
+          firstDigit++;
+        }
+        if (firstDigit >= end) {
+          throw new NumberFormatException(LazyUtils.convertToString(bytes, start,
+              length));
+        }
+        long exponent = parsePostExponent(bytes, start, end, offset, radix, length);
+        double radixPowExp = Math.pow(radix, exponent);
+        // Hand the value back with the same sign convention as resultSoFar.
+        double finalResult = result * radixPowExp * (resultSoFar <= 0 ? -1 : 1);
+        // BigDecimal.valueOf(double) throws NumberFormatException for an infinite
+        // or NaN argument, so an exponent that overflows double is rejected here.
+        return BigDecimal.valueOf(finalResult);
+      }
+      // Invalid format!
+      if (digit == -1) {
+        throw new NumberFormatException(LazyUtils.convertToString(bytes, start,
+            length));
+      }
+      // The fractional digits are accumulated in a double so that they are not
+      // lost if an exponent follows. For example, 1.3e+4 should return 13000.
+      result += (digit * Math.pow(radix, -1 * (++pos)));
+    }
+
+    // If we reached here, it means the fractional part is formatted correctly and
+    // contains only digits, so it is simply truncated.
+    return BigDecimal.valueOf(resultSoFar);
+  }
+
+  /**
    * /** Parses the string argument as if it was an long value and returns the
    * result. Throws NumberFormatException if the string does not represent an
    * long quantity. The second argument specifies the radix to use when parsing
    * the value.
- * + * * @param bytes * @param start * @param length @@ -140,16 +265,29 @@ int radix, boolean negative) { byte separator = '.'; long max = Long.MIN_VALUE / radix; - long result = 0, end = start + length; + long result = 0; + int end = start + length; + boolean isExponential = false; + boolean isDecimal = false; while (offset < end) { int digit = LazyUtils.digit(bytes[offset++], radix); if (digit == -1 || max > result) { + // If we hit decimal separator, break. if (bytes[offset-1] == separator) { // We allow decimals and will return a truncated integer in that case. // Therefore we won't throw an exception here (checking the fractional // part happens below.) + isDecimal = true; break; } + // If we hit an exponential notation, break. + if (bytes[offset-1] == 'e' || bytes[offset-1] == 'E') { + // Exponential notations are represented as 1e+5, 1e-5, 1e5, etc. + // i.e. whatever follows e or E should be a pure integer with only + // only option sign and decimal digits. + isExponential = true; + break; + } throw new NumberFormatException(LazyUtils.convertToString(bytes, start, length)); } @@ -161,17 +299,46 @@ result = next; } - // This is the case when we've encountered a decimal separator. The fractional - // part will not change the number, but we will verify that the fractional part - // is well formed. - while (offset < end) { - int digit = LazyUtils.digit(bytes[offset++], radix); - if (digit == -1) { + // Process the post exponent part + if (isExponential) { + long exponent = parsePostExponent(bytes, start, end, offset, radix, length); + double RadixPowExp = Math.pow(radix, exponent); + double finalResult = result * RadixPowExp; + if (!negative) { + finalResult = -finalResult; + if (finalResult < 0) { + throw new NumberFormatException(LazyUtils.convertToString(bytes, start, + length)); + } + } + // Check for overflow and underflow. 
+ if (finalResult > Long.MAX_VALUE || finalResult < Long.MIN_VALUE) { throw new NumberFormatException(LazyUtils.convertToString(bytes, start, length)); } + return (long)(finalResult); } + // Process the post decimal part + if (isDecimal) { + BigDecimal value = parsePostDecimal(bytes, start, end, offset, radix, length, result); + if (!negative) { + value = value.negate(); + if (value.signum() == -1) { + throw new NumberFormatException(LazyUtils.convertToString(bytes, start, + length)); + } + } + // Check for overflow and underflow. + if (value.compareTo(BigDecimal.valueOf(Long.MAX_VALUE)) > 0 || + value.compareTo(BigDecimal.valueOf(Long.MIN_VALUE)) < 0) { + throw new NumberFormatException(LazyUtils.convertToString(bytes, start, + length)); + } + return value.longValue(); + } + + // No exponent part, no decimal part. if (!negative) { result = -result; if (result < 0) { @@ -185,11 +352,11 @@ /** * Writes out the text representation of an integer using base 10 to an * OutputStream in UTF-8 encoding. - * + * * Note: division by a constant (like 10) is much faster than division by a * variable. That's one of the reasons that we don't make radix a parameter * here. - * + * * @param out * the outputstream to write to * @param i Index: serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyInteger.java =================================================================== --- serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyInteger.java (revision 1528177) +++ serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyInteger.java (working copy) @@ -19,22 +19,23 @@ import java.io.IOException; import java.io.OutputStream; +import java.math.BigDecimal; import org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyIntObjectInspector; import org.apache.hadoop.io.IntWritable; /** * LazyObject for storing a value of Integer. - * + * *

* Part of the code is adapted from Apache Harmony Project. - * + * * As with the specification, this implementation relied on code laid out in Henry S. Warren, Jr.'s Hacker's * Delight, (Addison Wesley, 2002) as well as The Aggregate's Magic Algorithms. *

- * + * */ public class LazyInteger extends LazyPrimitive { @@ -64,7 +65,7 @@ * Parses the string argument as if it was an int value and returns the * result. Throws NumberFormatException if the string does not represent an * int quantity. - * + * * @param bytes * @param start * @param length @@ -82,7 +83,7 @@ * result. Throws NumberFormatException if the string does not represent an * int quantity. The second argument specifies the radix to use when parsing * the value. - * + * * @param bytes * @param start * @param length @@ -117,10 +118,134 @@ } /** - * + * Function to check if the part post exponent is represented correctly. + * For now, we will allow only exponents between Integer.MIN_VALUE and + * Integer.MAX_VALUE. + * Eg. If the number is 1E+5, the fn returns 5. + * If the number is 1E-5, the fn returns -5. * @param bytes + * byte representation of the original number * @param start + * start of the original number + * @param end + * end position of the original number. + * @param offset + * the starting position after the exponent symbol (if exists) + * @param radix + * the base to use for conversion. * @param length + * a UTF-8 encoded string representation of an int quantity for the original number. + * @return the integer value post exponent symbol. + * @exception NumberFormatException + * if the argument could not be parsed as an int quantity. 
+   */
+  private static int parsePostExponent(byte[] bytes, int start, int end, int offset,
+      int radix, int length) {
+    // Reject an exponent symbol with nothing after it (e.g. "1e"). Without this
+    // guard bytes[offset] below would be read at or past 'end', i.e. outside the
+    // number being parsed (or outside the array altogether).
+    if (offset >= end) {
+      throw new NumberFormatException(LazyUtils.convertToString(bytes, start,
+          length));
+    }
+
+    boolean negative = bytes[offset] == '-';
+    if (negative || bytes[offset] == '+') {
+      offset++;
+    }
+
+    // Reject a sign that is not followed by any digit (e.g. "1e+" or "1e-");
+    // otherwise the loop below would not run and 0 would be returned.
+    if (offset >= end) {
+      throw new NumberFormatException(LazyUtils.convertToString(bytes, start,
+          length));
+    }
+
+    // The magnitude is accumulated as a non-positive number so that values down
+    // to Integer.MIN_VALUE can be held before the sign is applied.
+    int max = Integer.MIN_VALUE / radix;
+    int result = 0;
+
+    while (offset < end) {
+      int digit = LazyUtils.digit(bytes[offset++], radix);
+      if (digit == -1) {
+        // The byte is not a digit in this radix.
+        throw new NumberFormatException(LazyUtils.convertToString(bytes, start,
+            length));
+      }
+      // Multiplying by the radix would overflow.
+      if (max > result) {
+        throw new NumberFormatException(LazyUtils.convertToString(bytes, start,
+            length));
+      }
+      int next = result * radix - digit;
+      // Subtracting the digit wrapped around.
+      if (next > result) {
+        throw new NumberFormatException(LazyUtils.convertToString(bytes, start,
+            length));
+      }
+      result = next;
+    }
+    if (!negative) {
+      result = -result;
+      // -Integer.MIN_VALUE is still negative: the positive exponent does not fit.
+      if (result < 0) {
+        throw new NumberFormatException(LazyUtils.convertToString(bytes, start,
+            length));
+      }
+    }
+    return result;
+  }
+
+  /**
+   * Function to check if the fractional part post decimal is represented correctly.
+   * The value returned is the resultSoFar when the fractional part contains only digits.
+   * The value returned is the evaluated result when the fractional part has exponential
+   * notation. The examples below show the value after the caller has applied the sign
+   * and truncated it.
+   * For e.g. 1. if number = 1.2e+5, the value will be 120000
+   *          2. if number = 1.2e-5, the value will be 0 (0.000012)
+   *          3. if number = 1.2345 the value will be 1
+   * @param bytes
+   *          byte representation of the original number
+   * @param start
+   *          start of the original number
+   * @param end
+   *          end position of the original number.
+   * @param offset
+   *          the starting position after the decimal separator
+   * @param radix
+   *          the base to use for conversion.
+   * @param length
+   *          length in bytes of the original number.
+   * @param resultSoFar
+   *          result of the non-decimal part.
+   * @return the parsed value, carrying the sign convention of resultSoFar.
+   * @exception NumberFormatException
+   *              if the argument could not be parsed as an int quantity.
+   */
+  private static BigDecimal parsePostDecimal(byte[] bytes, int start, int end, int offset,
+      int radix, int length, int resultSoFar) {
+    // Take the magnitude in double space: Math.abs(Integer.MIN_VALUE) is still
+    // negative as an int, which would flip the sign of the final result.
+    double result = Math.abs((double) resultSoFar);
+    int pos = 0;
+
+    // This is the case when we've encountered a decimal separator. The fractional
+    // part will not change the number unless we encounter an exponential notation,
+    // but we will verify that the fractional part is well formed.
+    while (offset < end) {
+      byte current = bytes[offset++];
+      int digit = LazyUtils.digit(current, radix);
+      // Process the remaining part if we hit an exponential notation symbol. As in
+      // parse(), the symbol is only honoured when it is not a digit in this radix.
+      if (digit == -1 && (current == 'e' || current == 'E')) {
+        // Don't allow numbers to end with the exponent symbol or with a bare sign.
+        // This is checked on the bytes so that an explicit zero exponent such as
+        // "1.5e0" remains valid.
+        int firstDigit = offset;
+        if (firstDigit < end && (bytes[firstDigit] == '-' || bytes[firstDigit] == '+')) {
+          firstDigit++;
+        }
+        if (firstDigit >= end) {
+          throw new NumberFormatException(LazyUtils.convertToString(bytes, start,
+              length));
+        }
+        int exponent = parsePostExponent(bytes, start, end, offset, radix, length);
+        double radixPowExp = Math.pow(radix, exponent);
+        // Hand the value back with the same sign convention as resultSoFar.
+        double finalResult = result * radixPowExp * (resultSoFar <= 0 ? -1 : 1);
+        // BigDecimal.valueOf(double) throws NumberFormatException for an infinite
+        // or NaN argument, so an exponent that overflows double is rejected here.
+        return BigDecimal.valueOf(finalResult);
+      }
+      // Invalid format!
+      if (digit == -1) {
+        throw new NumberFormatException(LazyUtils.convertToString(bytes, start,
+            length));
+      }
+      // The fractional digits are accumulated in a double so that they are not
+      // lost if an exponent follows. For example, 1.3e+4 should return 13000.
+      result += (digit * Math.pow(radix, -1 * (++pos)));
+    }
+
+    // If we reached here, it means the fractional part is formatted correctly and
+    // contains only digits, so it is simply truncated.
+    return BigDecimal.valueOf(resultSoFar);
+  }
+
+  /**
+   *
+   * @param bytes
+   * @param start
+   * @param length
    *          a UTF-8 encoded string representation of an int quantity.
    * @param radix
    *          the base to use for conversion.
@@ -139,15 +264,29 @@ byte separator = '.'; int max = Integer.MIN_VALUE / radix; int result = 0, end = start + length; + boolean isExponential = false; + boolean isDecimal = false; while (offset < end) { int digit = LazyUtils.digit(bytes[offset++], radix); if (digit == -1) { + // If we hit a decimal, break. if (bytes[offset-1] == separator) { // We allow decimals and will return a truncated integer in that case. // Therefore we won't throw an exception here (checking the fractional // part happens below.) + isDecimal = true; break; } + // If we hit an exponential notation, break. + if (bytes[offset-1] == 'e' || bytes[offset-1] == 'E') { + // Exponential notations are represented as 1e+5, 1e-5, 1e5, etc. + // i.e. whatever follows e or E should be a pure integer with only + // only option sign and decimal digits. + isExponential = true; + break; + } + // If we encountered a character other than '.' or 'e' or 'E', + // the format is invalid. throw new NumberFormatException(LazyUtils.convertToString(bytes, start, length)); } @@ -163,17 +302,46 @@ result = next; } - // This is the case when we've encountered a decimal separator. The fractional - // part will not change the number, but we will verify that the fractional part - // is well formed. - while (offset < end) { - int digit = LazyUtils.digit(bytes[offset++], radix); - if (digit == -1) { + // Process the post exponent part + if (isExponential) { + long exponent = parsePostExponent(bytes, start, end, offset, radix, length); + double RadixPowExp = Math.pow(radix, exponent); + double finalResult = result * RadixPowExp; + if (!negative) { + finalResult = -finalResult; + if (finalResult < 0) { + throw new NumberFormatException(LazyUtils.convertToString(bytes, start, + length)); + } + } + // Check for overflow and underflow. 
+ if (finalResult > Integer.MAX_VALUE || finalResult < Integer.MIN_VALUE) { throw new NumberFormatException(LazyUtils.convertToString(bytes, start, length)); } + return (int)(finalResult); } + // Process the post decimal part + if (isDecimal) { + BigDecimal value = parsePostDecimal(bytes, start, end, offset, radix, length, result); + if (!negative) { + value = value.negate(); + if (value.signum() == -1) { + throw new NumberFormatException(LazyUtils.convertToString(bytes, start, + length)); + } + } + // Check for overflow and underflow. + if (value.compareTo(BigDecimal.valueOf(Integer.MAX_VALUE)) > 0 || + value.compareTo(BigDecimal.valueOf(Integer.MIN_VALUE)) < 0) { + throw new NumberFormatException(LazyUtils.convertToString(bytes, start, + length)); + } + return value.intValue(); + } + + // No exponent part, no decimal part. if (!negative) { result = -result; if (result < 0) { @@ -187,11 +355,11 @@ /** * Writes out the text representation of an integer using base 10 to an * OutputStream in UTF-8 encoding. - * + * * Note: division by a constant (like 10) is much faster than division by a * variable. That's one of the reasons that we don't make radix a parameter * here. 
- * + * * @param out * the outputstream to write to * @param i Index: ql/src/test/results/clientpositive/cast_to_int.q.out =================================================================== --- ql/src/test/results/clientpositive/cast_to_int.q.out (revision 1528177) +++ ql/src/test/results/clientpositive/cast_to_int.q.out (working copy) @@ -26,7 +26,14 @@ cast('-128' as tinyint), cast('127' as tinyint), cast('1.0a' as int), - cast('-1.-1' as int) + cast('-1.-1' as int), + cast('1.23e3' as int), + cast('-1.5e2' as int), + cast('1e+9' as int), + cast('1e-2' as int), + cast('1e+10' as int), + cast('1e+10' as bigint), + cast('1e2-34' as int) from src limit 1 PREHOOK: type: QUERY PREHOOK: Input: default@src @@ -59,9 +66,16 @@ cast('-128' as tinyint), cast('127' as tinyint), cast('1.0a' as int), - cast('-1.-1' as int) + cast('-1.-1' as int), + cast('1.23e3' as int), + cast('-1.5e2' as int), + cast('1e+9' as int), + cast('1e-2' as int), + cast('1e+10' as int), + cast('1e+10' as bigint), + cast('1e2-34' as int) from src limit 1 POSTHOOK: type: QUERY POSTHOOK: Input: default@src #### A masked pattern was here #### -1.0 1.4 1.6 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 NULL 2147483647 -2147483648 32767 -32768 -128 127 NULL NULL +1.0 1.4 1.6 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 100000 2147483647 -2147483648 32767 -32768 -128 127 NULL NULL 1230 -150 1000000000 0 NULL 10000000000 NULL Index: ql/src/test/queries/clientpositive/cast_to_int.q =================================================================== --- ql/src/test/queries/clientpositive/cast_to_int.q (revision 1528177) +++ ql/src/test/queries/clientpositive/cast_to_int.q (working copy) @@ -26,5 +26,12 @@ cast('-128' as tinyint), cast('127' as tinyint), cast('1.0a' as int), - cast('-1.-1' as int) + cast('-1.-1' as int), + cast('1.23e3' as int), + cast('-1.5e2' as int), + cast('1e+9' as int), + cast('1e-2' as int), + cast('1e+10' as int), + cast('1e+10' as bigint), + cast('1e2-34' as int) from src limit 1;