Index: serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyLong.java
===================================================================
--- serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyLong.java (revision 1528177)
+++ serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyLong.java (working copy)
@@ -19,22 +19,23 @@
import java.io.IOException;
import java.io.OutputStream;
+import java.math.BigDecimal;
import org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyLongObjectInspector;
import org.apache.hadoop.io.LongWritable;
/**
* LazyObject for storing a value of Long.
- *
+ *
*
* Part of the code is adapted from Apache Harmony Project.
- *
+ *
* As with the specification, this implementation relied on code laid out in Henry S. Warren, Jr.'s Hacker's
* Delight, (Addison Wesley, 2002) as well as The Aggregate's Magic Algorithms.
*
- *
+ *
*/
public class LazyLong extends
LazyPrimitive {
@@ -64,7 +65,7 @@
* Parses the string argument as if it was a long value and returns the
* result. Throws NumberFormatException if the string does not represent a
* long quantity.
- *
+ *
* @param bytes
* @param start
* @param length
@@ -82,7 +83,7 @@
* result. Throws NumberFormatException if the string does not represent an
* long quantity. The second argument specifies the radix to use when parsing
* the value.
- *
+ *
* @param bytes
* @param start
* @param length
@@ -117,11 +118,135 @@
}
/**
+   * Parses the exponent that follows an exponent symbol ('e' or 'E').
+   * The exponent is an optional sign followed by at least one digit and must lie
+   * between Long.MIN_VALUE and Long.MAX_VALUE.
+   * Eg. If the number is 1E+5, the fn returns 5.
+   *     If the number is 1E-5, the fn returns -5.
+   * @param bytes
+   *          byte representation of the original number
+   * @param start
+   *          start of the original number
+   * @param end
+   *          end position (exclusive) of the original number
+   * @param offset
+   *          the position of the first byte after the exponent symbol
+   * @param radix
+   *          the base to use for conversion
+   * @param length
+   *          length in bytes of the original number
+   * @return the signed value of the exponent
+   * @exception NumberFormatException
+   *              if the exponent is missing, malformed or out of range
+   */
+  private static long parsePostExponent(byte[] bytes, int start, int end, int offset,
+      int radix, int length) {
+    // Guard the read: the number may end right at the exponent symbol (e.g. "1e").
+    boolean negative = offset < end && bytes[offset] == '-';
+    long max = Long.MIN_VALUE / radix;
+    long result = 0;
+
+    if (negative || (offset < end && bytes[offset] == '+')) {
+      offset++;
+    }
+    // A bare exponent symbol or a lone sign is not a number; without this check it
+    // would be read as an exponent of zero.
+    if (offset >= end) {
+      throw new NumberFormatException(LazyUtils.convertToString(bytes, start, length));
+    }
+
+    // The value is accumulated as a negative number so that Long.MIN_VALUE fits.
+    while (offset < end) {
+      int digit = LazyUtils.digit(bytes[offset++], radix);
+      // Reject anything that is not a digit, and stop before result * radix overflows.
+      if (digit == -1 || max > result) {
+        throw new NumberFormatException(LazyUtils.convertToString(bytes, start, length));
+      }
+      long next = result * radix - digit;
+      // Subtracting the digit wrapped around.
+      if (next > result) {
+        throw new NumberFormatException(LazyUtils.convertToString(bytes, start, length));
+      }
+      result = next;
+    }
+    if (!negative) {
+      result = -result;
+      // -Long.MIN_VALUE is not representable and stays negative.
+      if (result < 0) {
+        throw new NumberFormatException(LazyUtils.convertToString(bytes, start, length));
+      }
+    }
+    return result;
+  }
+
+  /**
+   * Validates the fraction after the decimal separator and applies an exponent if present.
+   * Without an exponent the fraction is only checked and resultSoFar is returned as is.
+   * With an exponent the digits are scaled exactly and truncated towards zero; the
+   * result then carries the sign of resultSoFar, with zero treated as negative.
+   * For e.g. 1. if number = 1.2e+5, the magnitude returned will be 120000
+   *          2. if number = 1.2e-5, the magnitude returned will be 0 (0.000012)
+   *          3. if number = 1.2345 the magnitude returned will be 1
+   * @param bytes
+   *          byte representation of the original number
+   * @param start
+   *          start of the original number
+   * @param end
+   *          end position (exclusive) of the original number
+   * @param offset
+   *          the position of the first byte after the decimal separator
+   * @param radix
+   *          the base to use for conversion
+   * @param length
+   *          length in bytes of the original number
+   * @param resultSoFar
+   *          result of the integer part
+   * @return the value of the number, truncated to an integer
+   * @exception NumberFormatException if the fraction or exponent is malformed or too large
+   */
+  private static BigDecimal parsePostDecimal(byte[] bytes, int start, int end, int offset,
+      int radix, int length, long resultSoFar) {
+    final int fractionStart = offset;
+    while (offset < end) {
+      final byte current = bytes[offset++];
+      if (current == 'e' || current == 'E') {
+        // Need an exponent digit ("1.5e" and "1.5e+" are malformed); "1.5e0" is valid.
+        if (offset >= end || LazyUtils.digit(bytes[end - 1], radix) == -1) {
+          throw new NumberFormatException(LazyUtils.convertToString(bytes, start, length));
+        }
+        final long exponent = parsePostExponent(bytes, start, end, offset, radix, length);
+        // Exact arithmetic: in doubles 4.35 * 100 is 434.99999999999994, i.e. 434.
+        final BigDecimal base = BigDecimal.valueOf(radix);
+        BigDecimal mantissa = BigDecimal.valueOf(resultSoFar).abs();
+        for (int i = fractionStart; i < offset - 1; i++) {
+          final int digit = LazyUtils.digit(bytes[i], radix);
+          mantissa = mantissa.multiply(base).add(BigDecimal.valueOf(digit));
+        }
+        // Integer part < radix^64: exponents under -64 give 0, shifts over 64 overflow.
+        if (mantissa.signum() == 0 || exponent < -64) {
+          return BigDecimal.ZERO;
+        }
+        final long shift = exponent - (offset - 1 - fractionStart);
+        if (shift > 64) {
+          throw new NumberFormatException(LazyUtils.convertToString(bytes, start, length));
+        }
+        final BigDecimal magnitude = shift >= 0 ? mantissa.multiply(base.pow((int) shift))
+            : mantissa.divideToIntegralValue(base.pow((int) -shift));
+        return resultSoFar <= 0 ? magnitude.negate() : magnitude;
+      }
+      if (LazyUtils.digit(current, radix) == -1) {
+        throw new NumberFormatException(LazyUtils.convertToString(bytes, start, length));
+      }
+    }
+    return BigDecimal.valueOf(resultSoFar);
+  }
+
+ /**
* /** Parses the string argument as if it was an long value and returns the
* result. Throws NumberFormatException if the string does not represent an
* long quantity. The second argument specifies the radix to use when parsing
* the value.
- *
+ *
* @param bytes
* @param start
* @param length
@@ -140,16 +265,29 @@
int radix, boolean negative) {
byte separator = '.';
long max = Long.MIN_VALUE / radix;
- long result = 0, end = start + length;
+ long result = 0;
+ int end = start + length;
+ boolean isExponential = false;
+ boolean isDecimal = false;
while (offset < end) {
int digit = LazyUtils.digit(bytes[offset++], radix);
if (digit == -1 || max > result) {
+ // If we hit decimal separator, break.
if (bytes[offset-1] == separator) {
// We allow decimals and will return a truncated integer in that case.
// Therefore we won't throw an exception here (checking the fractional
// part happens below.)
+ isDecimal = true;
break;
}
+ // If we hit an exponential notation, break.
+ if (bytes[offset-1] == 'e' || bytes[offset-1] == 'E') {
+ // Exponential notations are represented as 1e+5, 1e-5, 1e5, etc.
+ // i.e. whatever follows e or E should be a pure integer with only
+ // an optional sign and decimal digits.
+ isExponential = true;
+ break;
+ }
throw new NumberFormatException(LazyUtils.convertToString(bytes, start,
length));
}
@@ -161,17 +299,46 @@
result = next;
}
- // This is the case when we've encountered a decimal separator. The fractional
- // part will not change the number, but we will verify that the fractional part
- // is well formed.
- while (offset < end) {
- int digit = LazyUtils.digit(bytes[offset++], radix);
- if (digit == -1) {
+ // Process the post exponent part
+ if (isExponential) {
+ long exponent = parsePostExponent(bytes, start, end, offset, radix, length);
+ double RadixPowExp = Math.pow(radix, exponent);
+ double finalResult = result * RadixPowExp;
+ if (!negative) {
+ finalResult = -finalResult;
+ if (finalResult < 0) {
+ throw new NumberFormatException(LazyUtils.convertToString(bytes, start,
+ length));
+ }
+ }
+ // Check for overflow and underflow.
+ if (finalResult > Long.MAX_VALUE || finalResult < Long.MIN_VALUE) {
throw new NumberFormatException(LazyUtils.convertToString(bytes, start,
length));
}
+ return (long)(finalResult);
}
+ // Process the post decimal part
+ if (isDecimal) {
+ BigDecimal value = parsePostDecimal(bytes, start, end, offset, radix, length, result);
+ if (!negative) {
+ value = value.negate();
+ if (value.signum() == -1) {
+ throw new NumberFormatException(LazyUtils.convertToString(bytes, start,
+ length));
+ }
+ }
+ // Check for overflow and underflow.
+ if (value.compareTo(BigDecimal.valueOf(Long.MAX_VALUE)) > 0 ||
+ value.compareTo(BigDecimal.valueOf(Long.MIN_VALUE)) < 0) {
+ throw new NumberFormatException(LazyUtils.convertToString(bytes, start,
+ length));
+ }
+ return value.longValue();
+ }
+
+ // No exponent part, no decimal part.
if (!negative) {
result = -result;
if (result < 0) {
@@ -185,11 +352,11 @@
/**
* Writes out the text representation of an integer using base 10 to an
* OutputStream in UTF-8 encoding.
- *
+ *
* Note: division by a constant (like 10) is much faster than division by a
* variable. That's one of the reasons that we don't make radix a parameter
* here.
- *
+ *
* @param out
* the outputstream to write to
* @param i
Index: serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyInteger.java
===================================================================
--- serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyInteger.java (revision 1528177)
+++ serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyInteger.java (working copy)
@@ -19,22 +19,23 @@
import java.io.IOException;
import java.io.OutputStream;
+import java.math.BigDecimal;
import org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyIntObjectInspector;
import org.apache.hadoop.io.IntWritable;
/**
* LazyObject for storing a value of Integer.
- *
+ *
*
* Part of the code is adapted from Apache Harmony Project.
- *
+ *
* As with the specification, this implementation relied on code laid out in Henry S. Warren, Jr.'s Hacker's
* Delight, (Addison Wesley, 2002) as well as The Aggregate's Magic Algorithms.
*
- *
+ *
*/
public class LazyInteger extends
LazyPrimitive {
@@ -64,7 +65,7 @@
* Parses the string argument as if it was an int value and returns the
* result. Throws NumberFormatException if the string does not represent an
* int quantity.
- *
+ *
* @param bytes
* @param start
* @param length
@@ -82,7 +83,7 @@
* result. Throws NumberFormatException if the string does not represent an
* int quantity. The second argument specifies the radix to use when parsing
* the value.
- *
+ *
* @param bytes
* @param start
* @param length
@@ -117,10 +118,134 @@
}
/**
- *
+   * Parses the exponent that follows an exponent symbol ('e' or 'E').
+   * The exponent is an optional sign followed by at least one digit and must lie
+   * between Integer.MIN_VALUE and Integer.MAX_VALUE.
+   * Eg. If the number is 1E+5, the fn returns 5.
+   *     If the number is 1E-5, the fn returns -5.
* @param bytes
+   *          byte representation of the original number
* @param start
+   *          start of the original number
+   * @param end
+   *          end position (exclusive) of the original number
+   * @param offset
+   *          the position of the first byte after the exponent symbol
+   * @param radix
+   *          the base to use for conversion
* @param length
+   *          length in bytes of the original number
+   * @return the signed value of the exponent
+   * @exception NumberFormatException
+   *              if the exponent is missing, malformed or out of range
+   */
+  private static int parsePostExponent(byte[] bytes, int start, int end, int offset,
+      int radix, int length) {
+    // Guard the read: the number may end right at the exponent symbol (e.g. "1e").
+    boolean negative = offset < end && bytes[offset] == '-';
+    int max = Integer.MIN_VALUE / radix;
+    int result = 0;
+
+    if (negative || (offset < end && bytes[offset] == '+')) {
+      offset++;
+    }
+    // A bare exponent symbol or a lone sign is not a number; without this check it
+    // would be read as an exponent of zero.
+    if (offset >= end) {
+      throw new NumberFormatException(LazyUtils.convertToString(bytes, start, length));
+    }
+
+    // The value is accumulated as a negative number so that Integer.MIN_VALUE fits.
+    while (offset < end) {
+      int digit = LazyUtils.digit(bytes[offset++], radix);
+      // Reject anything that is not a digit, and stop before result * radix overflows.
+      if (digit == -1 || max > result) {
+        throw new NumberFormatException(LazyUtils.convertToString(bytes, start, length));
+      }
+      int next = result * radix - digit;
+      // Subtracting the digit wrapped around.
+      if (next > result) {
+        throw new NumberFormatException(LazyUtils.convertToString(bytes, start, length));
+      }
+      result = next;
+    }
+    if (!negative) {
+      result = -result;
+      // -Integer.MIN_VALUE is not representable and stays negative.
+      if (result < 0) {
+        throw new NumberFormatException(LazyUtils.convertToString(bytes, start, length));
+      }
+    }
+    return result;
+  }
+
+  /**
+   * Validates the fraction after the decimal separator and applies an exponent if present.
+   * Without an exponent the fraction is only checked and resultSoFar is returned as is.
+   * With an exponent the digits are scaled exactly and truncated towards zero; the
+   * result then carries the sign of resultSoFar, with zero treated as negative.
+   * For e.g. 1. if number = 1.2e+5, the magnitude returned will be 120000
+   *          2. if number = 1.2e-5, the magnitude returned will be 0 (0.000012)
+   *          3. if number = 1.2345 the magnitude returned will be 1
+   * @param bytes
+   *          byte representation of the original number
+   * @param start
+   *          start of the original number
+   * @param end
+   *          end position (exclusive) of the original number
+   * @param offset
+   *          the position of the first byte after the decimal separator
+   * @param radix
+   *          the base to use for conversion
+   * @param length
+   *          length in bytes of the original number
+   * @param resultSoFar
+   *          result of the non-decimal part
+   * @return the value of the number, truncated to an integer
+   * @exception NumberFormatException if the fraction or exponent is malformed or too large
+   */
+  private static BigDecimal parsePostDecimal(byte[] bytes, int start, int end, int offset,
+      int radix, int length, int resultSoFar) {
+    final int fractionStart = offset;
+    while (offset < end) {
+      final byte current = bytes[offset++];
+      if (current == 'e' || current == 'E') {
+        // Need an exponent digit ("1.5e" and "1.5e+" are malformed); "1.5e0" is valid.
+        if (offset >= end || LazyUtils.digit(bytes[end - 1], radix) == -1) {
+          throw new NumberFormatException(LazyUtils.convertToString(bytes, start, length));
+        }
+        final long exponent = parsePostExponent(bytes, start, end, offset, radix, length);
+        // Exact arithmetic: in doubles 4.35 * 100 is 434.99999999999994, i.e. 434.
+        final BigDecimal base = BigDecimal.valueOf(radix);
+        BigDecimal mantissa = BigDecimal.valueOf(resultSoFar).abs();
+        for (int i = fractionStart; i < offset - 1; i++) {
+          final int digit = LazyUtils.digit(bytes[i], radix);
+          mantissa = mantissa.multiply(base).add(BigDecimal.valueOf(digit));
+        }
+        // Integer part < radix^32: exponents under -32 give 0, shifts over 32 overflow.
+        if (mantissa.signum() == 0 || exponent < -32) {
+          return BigDecimal.ZERO;
+        }
+        final long shift = exponent - (offset - 1 - fractionStart);
+        if (shift > 32) {
+          throw new NumberFormatException(LazyUtils.convertToString(bytes, start, length));
+        }
+        final BigDecimal magnitude = shift >= 0 ? mantissa.multiply(base.pow((int) shift))
+            : mantissa.divideToIntegralValue(base.pow((int) -shift));
+        return resultSoFar <= 0 ? magnitude.negate() : magnitude;
+      }
+      if (LazyUtils.digit(current, radix) == -1) {
+        throw new NumberFormatException(LazyUtils.convertToString(bytes, start, length));
+      }
+    }
+    return BigDecimal.valueOf(resultSoFar);
+  }
+
+ /**
+ *
+ * @param bytes
+ * @param start
+ * @param length
* a UTF-8 encoded string representation of an int quantity.
* @param radix
* the base to use for conversion.
@@ -139,15 +264,29 @@
byte separator = '.';
int max = Integer.MIN_VALUE / radix;
int result = 0, end = start + length;
+ boolean isExponential = false;
+ boolean isDecimal = false;
while (offset < end) {
int digit = LazyUtils.digit(bytes[offset++], radix);
if (digit == -1) {
+ // If we hit a decimal, break.
if (bytes[offset-1] == separator) {
// We allow decimals and will return a truncated integer in that case.
// Therefore we won't throw an exception here (checking the fractional
// part happens below.)
+ isDecimal = true;
break;
}
+ // If we hit an exponential notation, break.
+ if (bytes[offset-1] == 'e' || bytes[offset-1] == 'E') {
+ // Exponential notations are represented as 1e+5, 1e-5, 1e5, etc.
+ // i.e. whatever follows e or E should be a pure integer with only
+ // an optional sign and decimal digits.
+ isExponential = true;
+ break;
+ }
+ // If we encountered a character other than '.' or 'e' or 'E',
+ // the format is invalid.
throw new NumberFormatException(LazyUtils.convertToString(bytes, start,
length));
}
@@ -163,17 +302,46 @@
result = next;
}
- // This is the case when we've encountered a decimal separator. The fractional
- // part will not change the number, but we will verify that the fractional part
- // is well formed.
- while (offset < end) {
- int digit = LazyUtils.digit(bytes[offset++], radix);
- if (digit == -1) {
+ // Process the post exponent part
+ if (isExponential) {
+ long exponent = parsePostExponent(bytes, start, end, offset, radix, length);
+ double RadixPowExp = Math.pow(radix, exponent);
+ double finalResult = result * RadixPowExp;
+ if (!negative) {
+ finalResult = -finalResult;
+ if (finalResult < 0) {
+ throw new NumberFormatException(LazyUtils.convertToString(bytes, start,
+ length));
+ }
+ }
+ // Check for overflow and underflow.
+ if (finalResult > Integer.MAX_VALUE || finalResult < Integer.MIN_VALUE) {
throw new NumberFormatException(LazyUtils.convertToString(bytes, start,
length));
}
+ return (int)(finalResult);
}
+ // Process the post decimal part
+ if (isDecimal) {
+ BigDecimal value = parsePostDecimal(bytes, start, end, offset, radix, length, result);
+ if (!negative) {
+ value = value.negate();
+ if (value.signum() == -1) {
+ throw new NumberFormatException(LazyUtils.convertToString(bytes, start,
+ length));
+ }
+ }
+ // Check for overflow and underflow.
+ if (value.compareTo(BigDecimal.valueOf(Integer.MAX_VALUE)) > 0 ||
+ value.compareTo(BigDecimal.valueOf(Integer.MIN_VALUE)) < 0) {
+ throw new NumberFormatException(LazyUtils.convertToString(bytes, start,
+ length));
+ }
+ return value.intValue();
+ }
+
+ // No exponent part, no decimal part.
if (!negative) {
result = -result;
if (result < 0) {
@@ -187,11 +355,11 @@
/**
* Writes out the text representation of an integer using base 10 to an
* OutputStream in UTF-8 encoding.
- *
+ *
* Note: division by a constant (like 10) is much faster than division by a
* variable. That's one of the reasons that we don't make radix a parameter
* here.
- *
+ *
* @param out
* the outputstream to write to
* @param i
Index: ql/src/test/results/clientpositive/cast_to_int.q.out
===================================================================
--- ql/src/test/results/clientpositive/cast_to_int.q.out (revision 1528177)
+++ ql/src/test/results/clientpositive/cast_to_int.q.out (working copy)
@@ -26,7 +26,14 @@
cast('-128' as tinyint),
cast('127' as tinyint),
cast('1.0a' as int),
- cast('-1.-1' as int)
+ cast('-1.-1' as int),
+ cast('1.23e3' as int),
+ cast('-1.5e2' as int),
+ cast('1e+9' as int),
+ cast('1e-2' as int),
+ cast('1e+10' as int),
+ cast('1e+10' as bigint),
+ cast('1e2-34' as int)
from src limit 1
PREHOOK: type: QUERY
PREHOOK: Input: default@src
@@ -59,9 +66,16 @@
cast('-128' as tinyint),
cast('127' as tinyint),
cast('1.0a' as int),
- cast('-1.-1' as int)
+ cast('-1.-1' as int),
+ cast('1.23e3' as int),
+ cast('-1.5e2' as int),
+ cast('1e+9' as int),
+ cast('1e-2' as int),
+ cast('1e+10' as int),
+ cast('1e+10' as bigint),
+ cast('1e2-34' as int)
from src limit 1
POSTHOOK: type: QUERY
POSTHOOK: Input: default@src
#### A masked pattern was here ####
-1.0 1.4 1.6 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 NULL 2147483647 -2147483648 32767 -32768 -128 127 NULL NULL
+1.0 1.4 1.6 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 100000 2147483647 -2147483648 32767 -32768 -128 127 NULL NULL 1230 -150 1000000000 0 NULL 10000000000 NULL
Index: ql/src/test/queries/clientpositive/cast_to_int.q
===================================================================
--- ql/src/test/queries/clientpositive/cast_to_int.q (revision 1528177)
+++ ql/src/test/queries/clientpositive/cast_to_int.q (working copy)
@@ -26,5 +26,12 @@
cast('-128' as tinyint),
cast('127' as tinyint),
cast('1.0a' as int),
- cast('-1.-1' as int)
+ cast('-1.-1' as int),
+ cast('1.23e3' as int),
+ cast('-1.5e2' as int),
+ cast('1e+9' as int),
+ cast('1e-2' as int),
+ cast('1e+10' as int),
+ cast('1e+10' as bigint),
+ cast('1e2-34' as int)
from src limit 1;