diff --git a/common/src/java/org/apache/hive/common/util/HiveStringUtils.java b/common/src/java/org/apache/hive/common/util/HiveStringUtils.java
index dc3ee98..bf34600 100644
--- a/common/src/java/org/apache/hive/common/util/HiveStringUtils.java
+++ b/common/src/java/org/apache/hive/common/util/HiveStringUtils.java
@@ -1007,6 +1007,83 @@ public static int getTextUtfLength(Text t) {
     return len;
   }
 
+  /**
+   * Checks whether the given byte is a 7-bit ASCII character, i.e. its high
+   * bit is clear.
+   *
+   * @param b the byte to test
+   * @return true if the high bit of b is not set
+   */
+  public static boolean isAscii(byte b) {
+    return (b & 0x80) == 0;
+  }
+
+  /**
+   * Returns the number of leading ASCII whitespace bytes in the region
+   * [start, start + length) of a UTF-8 encoded string.
+   *
+   * @param bytes the buffer holding the UTF-8 encoded string
+   * @param start the offset of the first byte of the string within bytes
+   * @param length the number of bytes in the string
+   * @return the number of leading whitespace bytes, between 0 and length
+   */
+  public static int findLeadingSpaces(byte[] bytes, int start, int length) {
+    int numSpaces;
+    for (numSpaces = 0; numSpaces < length; ++numSpaces) {
+      int curPos = start + numSpaces;
+      if (isAscii(bytes[curPos]) && Character.isWhitespace(bytes[curPos])) {
+        continue;
+      }
+      break; // non-space character
+    }
+    // numSpaces is already relative to start, so it must not be offset again.
+    return numSpaces;
+  }
+
+  /**
+   * Returns the number of trailing ASCII whitespace bytes in the region
+   * [start, start + length) of a UTF-8 encoded string.
+   *
+   * @param bytes the buffer holding the UTF-8 encoded string
+   * @param start the offset of the first byte of the string within bytes
+   * @param length the number of bytes in the string
+   * @return the number of trailing whitespace bytes, between 0 and length
+   */
+  public static int findTrailingSpaces(byte[] bytes, int start, int length) {
+    int numSpaces;
+    for (numSpaces = 0; numSpaces < length; ++numSpaces) {
+      int curPos = start + (length - (numSpaces + 1));
+      if (isAscii(bytes[curPos]) && Character.isWhitespace(bytes[curPos])) {
+        continue;
+      } else {
+        break; // non-space character
+      }
+    }
+    return numSpaces;
+  }
+
+  /**
+   * Finds the length of a UTF-8 encoded string once leading and trailing
+   * whitespace has been removed.
+   *
+   * @param bytes the buffer holding the UTF-8 encoded string
+   * @param start the offset of the first byte of the string within bytes
+   * @param length the number of bytes in the string
+   * @param leadingSpaces the number of leading whitespace bytes, expected to
+   *          be the value findLeadingSpaces returns for the same region
+   * @return the number of bytes remaining after trimming; 0 if the string is
+   *         entirely whitespace
+   */
+  public static int findTrimmedLength(byte[] bytes, int start, int length, int leadingSpaces) {
+    int trailingSpaces = findTrailingSpaces(bytes, start, length);
+    length = length - leadingSpaces;
+    // If string is entirely whitespace, no need to apply trailingSpaces.
+    if (length > 0) {
+      length = length - trailingSpaces;
+    }
+    return length;
+  }
+
   public static String normalizeIdentifier(String identifier) {
     return identifier.trim().toLowerCase();
   }
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFToByte.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFToByte.java
index 12f530b..a5c49f3 100755
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFToByte.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFToByte.java
@@ -172,7 +172,7 @@ public ByteWritable evaluate(Text i) {
         return null;
       }
       try {
-        byteWritable.set(LazyByte.parseByte(i.getBytes(), 0, i.getLength(), 10));
+        byteWritable.set(LazyByte.parseByte(i.getBytes(), 0, i.getLength(), 10, true));
         return byteWritable;
       } catch (NumberFormatException e) {
         // MySQL returns 0 if the string is not a well-formed numeric value.
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFToInteger.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFToInteger.java
index 1de7604..df756d0 100755
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFToInteger.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFToInteger.java
@@ -175,7 +175,7 @@ public IntWritable evaluate(Text i) {
       }
       try {
         intWritable.set(LazyInteger
-            .parseInt(i.getBytes(), 0, i.getLength(), 10));
+            .parseInt(i.getBytes(), 0, i.getLength(), 10, true));
         return intWritable;
       } catch (NumberFormatException e) {
         // MySQL returns 0 if the string is not a well-formed numeric value.
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFToLong.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFToLong.java
index 11f6a6c..4821d67 100755
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFToLong.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFToLong.java
@@ -184,7 +184,7 @@ public LongWritable evaluate(Text i) {
       }
       try {
         longWritable
-            .set(LazyLong.parseLong(i.getBytes(), 0, i.getLength(), 10));
+            .set(LazyLong.parseLong(i.getBytes(), 0, i.getLength(), 10, true));
         return longWritable;
       } catch (NumberFormatException e) {
         // MySQL returns 0 if the string is not a well-formed numeric value.
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFToShort.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFToShort.java
index 5372549..df0fc96 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFToShort.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFToShort.java
@@ -174,7 +174,7 @@ public ShortWritable evaluate(Text i) {
       }
       try {
         shortWritable.set(LazyShort.parseShort(i.getBytes(), 0, i.getLength(),
-            10));
+            10, true));
         return shortWritable;
       } catch (NumberFormatException e) {
         // MySQL returns 0 if the string is not a well-formed numeric value.
diff --git a/ql/src/test/queries/clientpositive/cast2.q b/ql/src/test/queries/clientpositive/cast2.q
new file mode 100644
index 0000000..12228be
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/cast2.q
@@ -0,0 +1,5 @@
+
+select cast('1' as tinyint), cast('1' as smallint), cast('1' as int), cast('1' as bigint), cast('1' as float), cast('1' as double), cast('1' as decimal(10,2));
+
+-- Check that leading/trailing space is handled consistently for numeric types
+select cast(' 1 ' as tinyint), cast(' 1 ' as smallint), cast(' 1 ' as int), cast(' 1 ' as bigint), cast(' 1 ' as float), cast(' 1 ' as double), cast(' 1 ' as decimal(10,2));
diff --git a/ql/src/test/results/clientpositive/cast2.q.out b/ql/src/test/results/clientpositive/cast2.q.out
new file mode 100644
index 0000000..9fa3a9d
--- /dev/null
+++ b/ql/src/test/results/clientpositive/cast2.q.out
@@ -0,0 +1,18 @@
+PREHOOK: query: select cast('1' as tinyint), cast('1' as smallint), cast('1' as int), cast('1' as bigint), cast('1' as float), cast('1' as double), cast('1' as decimal(10,2))
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+#### A masked pattern was here ####
+POSTHOOK: query: select cast('1' as tinyint), cast('1' as smallint), cast('1' as int), cast('1' as bigint), cast('1' as float), cast('1' as double), cast('1' as decimal(10,2))
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+#### A masked pattern was here ####
+1 1 1 1 1.0 1.0 1
+PREHOOK: query: select cast(' 1 ' as tinyint), cast(' 1 ' as smallint), cast(' 1 ' as int), cast(' 1 ' as bigint), cast(' 1 ' as float), cast(' 1 ' as double), cast(' 1 ' as decimal(10,2))
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+#### A masked pattern was here ####
+POSTHOOK: query: select cast(' 1 ' as tinyint), cast(' 1 ' as smallint), cast(' 1 ' as int), cast(' 1 ' as bigint), cast(' 1 ' as float), cast(' 1 ' as double), cast(' 1 ' as decimal(10,2))
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+#### A masked pattern was here ####
+1 1 1 1 1.0 1.0 1
diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyByte.java b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyByte.java
index 1f9cead..405c905 100644
--- a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyByte.java
+++ b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyByte.java
@@ -95,7 +95,29 @@ public static byte parseByte(byte[] bytes, int start, int length) {
    *              if the argument could not be parsed as a byte quantity.
    */
   public static byte parseByte(byte[] bytes, int start, int length, int radix) {
-    int intValue = LazyInteger.parseInt(bytes, start, length, radix);
+    return parseByte(bytes, start, length, radix, false);
+  }
+
+  /**
+   * Parses the string argument as if it was a byte value and returns the
+   * result. Throws NumberFormatException if the string does not represent a
+   * single byte quantity. The radix argument specifies the base to use when
+   * parsing the value.
+   *
+   * @param bytes
+   *          buffer holding a UTF-8 encoded string representation of a single byte quantity.
+   * @param start offset of the first byte of the string within bytes
+   * @param length number of bytes in the string
+   * @param radix
+   *          the radix to use when parsing.
+   * @param trim
+   *          whether to ignore leading/trailing ASCII whitespace before parsing
+   * @return byte the value represented by the argument
+   * @throws NumberFormatException
+   *           if the argument could not be parsed as a byte quantity.
+   */
+  public static byte parseByte(byte[] bytes, int start, int length, int radix, boolean trim) {
+    int intValue = LazyInteger.parseInt(bytes, start, length, radix, trim);
     byte result = (byte) intValue;
     if (result == intValue) {
       return result;
diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyInteger.java b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyInteger.java
index 22742aa..9687c6d 100644
--- a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyInteger.java
+++ b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyInteger.java
@@ -22,6 +22,7 @@
 
 import org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyIntObjectInspector;
 import org.apache.hadoop.io.IntWritable;
+import org.apache.hive.common.util.HiveStringUtils;
 
 /**
  * LazyObject for storing a value of Integer.
@@ -98,12 +99,42 @@ public static int parseInt(byte[] bytes, int start, int length) {
    *              if the argument could not be parsed as an int quantity.
    */
   public static int parseInt(byte[] bytes, int start, int length, int radix) {
+    return parseInt(bytes, start, length, radix, false);
+  }
+
+  /**
+   * Parses the string argument as if it was an int value and returns the
+   * result. Throws NumberFormatException if the string does not represent an
+   * int quantity. The radix argument specifies the base to use when parsing
+   * the value.
+   *
+   * @param bytes
+   *          buffer holding a UTF-8 encoded string representation of an int quantity.
+   * @param start offset of the first byte of the string within bytes
+   * @param length number of bytes in the string
+   * @param radix
+   *          the base to use for conversion.
+   * @param trim
+   *          whether to ignore leading/trailing ASCII whitespace before parsing
+   * @return the value represented by the argument
+   * @exception NumberFormatException
+   *              if the argument could not be parsed as an int quantity.
+   */
+  public static int parseInt(byte[] bytes, int start, int length, int radix, boolean trim) {
     if (bytes == null) {
       throw new NumberFormatException("String is null");
     }
     if (radix < Character.MIN_RADIX || radix > Character.MAX_RADIX) {
       throw new NumberFormatException("Invalid radix: " + radix);
     }
+    if (trim) {
+      // Handle leading/trailing whitespace
+      int leadingSpaces = HiveStringUtils.findLeadingSpaces(bytes, start, length);
+      int trailingSpaces = HiveStringUtils.findTrailingSpaces(bytes, start, length);
+      start = start + leadingSpaces;
+      // min() needed in the case that entire string is whitespace
+      length = length - Math.min(length, leadingSpaces + trailingSpaces);
+    }
     if (length == 0) {
       throw new NumberFormatException("Empty string!");
     }
diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyLong.java b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyLong.java
index c0d52b9..91c7e5e 100644
--- a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyLong.java
+++ b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyLong.java
@@ -22,6 +22,7 @@
 
 import org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyLongObjectInspector;
 import org.apache.hadoop.io.LongWritable;
+import org.apache.hive.common.util.HiveStringUtils;
 
 /**
  * LazyObject for storing a value of Long.
@@ -98,12 +99,42 @@ public static long parseLong(byte[] bytes, int start, int length) {
    *              if the argument could not be parsed as an long quantity.
    */
   public static long parseLong(byte[] bytes, int start, int length, int radix) {
+    return parseLong(bytes, start, length, radix, false);
+  }
+
+  /**
+   * Parses the string argument as if it was a long value and returns the
+   * result. Throws NumberFormatException if the string does not represent a
+   * long quantity. The radix argument specifies the base to use when parsing
+   * the value.
+   *
+   * @param bytes
+   *          buffer holding a UTF-8 encoded string representation of a long quantity.
+   * @param start offset of the first byte of the string within bytes
+   * @param length number of bytes in the string
+   * @param radix
+   *          the base to use for conversion.
+   * @param trim
+   *          whether to ignore leading/trailing ASCII whitespace before parsing
+   * @return the value represented by the argument
+   * @exception NumberFormatException
+   *              if the argument could not be parsed as a long quantity.
+   */
+  public static long parseLong(byte[] bytes, int start, int length, int radix, boolean trim) {
     if (bytes == null) {
       throw new NumberFormatException("String is null");
     }
     if (radix < Character.MIN_RADIX || radix > Character.MAX_RADIX) {
       throw new NumberFormatException("Invalid radix: " + radix);
     }
+    if (trim) {
+      // Handle leading/trailing whitespace
+      int leadingSpaces = HiveStringUtils.findLeadingSpaces(bytes, start, length);
+      int trailingSpaces = HiveStringUtils.findTrailingSpaces(bytes, start, length);
+      start = start + leadingSpaces;
+      // min() needed in the case that entire string is whitespace
+      length = length - Math.min(length, leadingSpaces + trailingSpaces);
+    }
     if (length == 0) {
       throw new NumberFormatException("Empty string!");
     }
diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyShort.java b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyShort.java
index b8b9488..39c2b60 100644
--- a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyShort.java
+++ b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyShort.java
@@ -95,7 +95,29 @@ public static short parseShort(byte[] bytes, int start, int length) {
    *              if the argument could not be parsed as a short quantity.
    */
   public static short parseShort(byte[] bytes, int start, int length, int radix) {
-    int intValue = LazyInteger.parseInt(bytes, start, length, radix);
+    return parseShort(bytes, start, length, radix, false);
+  }
+
+  /**
+   * Parses the string argument as if it was a short value and returns the
+   * result. Throws NumberFormatException if the string does not represent a
+   * single short quantity. The radix argument specifies the base to use when
+   * parsing the value.
+   *
+   * @param bytes
+   *          buffer holding a UTF-8 encoded string representation of a short quantity.
+   * @param start offset of the first byte of the string within bytes
+   * @param length number of bytes in the string
+   * @param radix
+   *          the radix to use when parsing.
+   * @param trim
+   *          whether to ignore leading/trailing ASCII whitespace before parsing
+   * @return short the value represented by the argument
+   * @exception NumberFormatException
+   *              if the argument could not be parsed as a short quantity.
+   */
+  public static short parseShort(byte[] bytes, int start, int length, int radix, boolean trim) {
+    int intValue = LazyInteger.parseInt(bytes, start, length, radix, trim);
     short result = (short) intValue;
     if (result == intValue) {
       return result;
diff --git a/serde/src/test/org/apache/hadoop/hive/serde2/lazy/TestLazyPrimitive.java b/serde/src/test/org/apache/hadoop/hive/serde2/lazy/TestLazyPrimitive.java
index 3d7f11e..663d1f2 100644
--- a/serde/src/test/org/apache/hadoop/hive/serde2/lazy/TestLazyPrimitive.java
+++ b/serde/src/test/org/apache/hadoop/hive/serde2/lazy/TestLazyPrimitive.java
@@ -468,4 +468,202 @@ public void testLazyLongWrite() throws Throwable {
     }
   }
 
+  private void testIntCaseWithPass(String strVal, int intVal, boolean trim) {
+    Text text = new Text(strVal);
+    assertEquals(
+        intVal,
+        LazyInteger.parseInt(text.getBytes(), 0, text.getLength(), 10, trim));
+  }
+
+  private void testIntCaseWithFail(String strVal, boolean trim) {
+    Text text = new Text(strVal);
+    try {
+      LazyInteger.parseInt(text.getBytes(), 0, text.getLength(), 10, trim);
+      fail("Expected to fail while parsing '" + strVal + "'");
+    } catch (NumberFormatException err) {
+      // Error was expected
+    }
+  }
+
+  private void testLongCaseWithPass(String strVal, long longVal, boolean trim) {
+    Text text = new Text(strVal);
+    assertEquals(
+        longVal,
+        LazyLong.parseLong(text.getBytes(), 0, text.getLength(), 10, trim));
+  }
+
+  private void testLongCaseWithFail(String strVal, boolean trim) {
+    Text text = new Text(strVal);
+    try {
+      LazyLong.parseLong(text.getBytes(), 0, text.getLength(), 10, trim);
+      fail("Expected to fail while parsing '" + strVal + "'");
+    } catch (NumberFormatException err) {
+      // Error was expected
+    }
+  }
+
+  public void testLazyIntWithSpaces() throws Throwable {
+    Object[][] casesWithoutSpaces = {
+        {"0", 0},
+        {"-128", -128},
+        {"128", 128},
+        {"+128", 128},
+        {"-2147483648", -2147483648},
+        {"2147483647", 2147483647},
+        {"+2147483647", 2147483647},
+    };
+
+    Object[][] casesWithSpaces = {
+        {" 0", 0},
+        {"0 ", 0},
+        {" 0 ", 0},
+        {" -128", -128},
+        {"-128 ", -128},
+        {" -128 ", -128},
+        {" 128", 128},
+        {"128 ", 128},
+        {" 128 ", 128},
+        {" +128", 128},
+        {"+128 ", 128},
+        {" +128 ", 128},
+        {" +128 ", 128},
+        {" -2147483648", -2147483648},
+        {"-2147483648 ", -2147483648},
+        {" -2147483648 ", -2147483648},
+        {" 2147483647", 2147483647},
+        {"2147483647 ", 2147483647},
+        {" 2147483647 ", 2147483647},
+        {" +2147483647", 2147483647},
+        {"+2147483647 ", 2147483647},
+        {" +2147483647 ", 2147483647},
+    };
+
+    String[] casesWithErrors = {
+        "",
+        " ",
+        "one",
+        " one ",
+        "123:",
+        "123a",
+        " 123a ",
+        "a123",
+        " a123 ",
+        // Exceeds MAX_VALUE
+        "2147483648",
+        "-2147483649",
+    };
+
+    //
+    // trim=false
+    //
+    boolean trim = false;
+    for (Object[] testCase : casesWithoutSpaces) {
+      testIntCaseWithPass((String) testCase[0], ((Number) testCase[1]).intValue(), trim);
+    }
+    for (Object[] testCase : casesWithSpaces) {
+      // With trim=false, parsing cannot handle spaces
+      testIntCaseWithFail((String) testCase[0], trim);
+    }
+    for (String testCase : casesWithErrors) {
+      testIntCaseWithFail(testCase, trim);
+    }
+
+    //
+    // trim=true
+    //
+    trim = true;
+    for (Object[] testCase : casesWithoutSpaces) {
+      testIntCaseWithPass((String) testCase[0], ((Number) testCase[1]).intValue(), trim);
+    }
+    for (Object[] testCase : casesWithSpaces) {
+      // With trim=true, parsing can handle spaces
+      testIntCaseWithPass((String) testCase[0], ((Number) testCase[1]).intValue(), trim);
+    }
+    for (String testCase : casesWithErrors) {
+      testIntCaseWithFail(testCase, trim);
+    }
+  }
+
+  public void testLazyLongWithSpaces() throws Throwable {
+    Object[][] casesWithoutSpaces = {
+        {"0", 0},
+        {"-128", -128},
+        {"128", 128},
+        {"+128", 128},
+        {"-9223372036854775808", -9223372036854775808L},
+        {"9223372036854775807", 9223372036854775807L},
+        {"+9223372036854775807", 9223372036854775807L},
+    };
+
+    Object[][] casesWithSpaces = {
+        {" 0", 0},
+        {"0 ", 0},
+        {" 0 ", 0},
+        {" -128", -128},
+        {"-128 ", -128},
+        {" -128 ", -128},
+        {" 128", 128},
+        {"128 ", 128},
+        {" 128 ", 128},
+        {" +128", 128},
+        {"+128 ", 128},
+        {" +128 ", 128},
+        {" +128 ", 128},
+        {" -9223372036854775808", -9223372036854775808L},
+        {"-9223372036854775808 ", -9223372036854775808L},
+        {" -9223372036854775808 ", -9223372036854775808L},
+        {" 9223372036854775807", 9223372036854775807L},
+        {"9223372036854775807 ", 9223372036854775807L},
+        {" 9223372036854775807 ", 9223372036854775807L},
+        {" +9223372036854775807", 9223372036854775807L},
+        {"+9223372036854775807 ", 9223372036854775807L},
+        {" +9223372036854775807 ", 9223372036854775807L},
+    };
+
+    String[] casesWithErrors = {
+        "",
+        " ",
+        "one",
+        " one ",
+        "123:",
+        "123a",
+        " 123a ",
+        "a123",
+        " a123 ",
+        // Outside the long range (above MAX_VALUE / below MIN_VALUE)
+        "9223372036854775808",
+        "9223372036854775809",
+        "-9223372036854775809",
+    };
+
+    //
+    // trim=false
+    //
+    boolean trim = false;
+    for (Object[] testCase : casesWithoutSpaces) {
+      testLongCaseWithPass((String) testCase[0], ((Number) testCase[1]).longValue(), trim);
+    }
+    for (Object[] testCase : casesWithSpaces) {
+      // With trim=false, parsing cannot handle spaces
+      testLongCaseWithFail((String) testCase[0], trim);
+    }
+    for (String testCase : casesWithErrors) {
+      testLongCaseWithFail(testCase, trim);
+    }
+
+    //
+    // trim=true
+    //
+    trim = true;
+    for (Object[] testCase : casesWithoutSpaces) {
+      testLongCaseWithPass((String) testCase[0], ((Number) testCase[1]).longValue(), trim);
+    }
+    for (Object[] testCase : casesWithSpaces) {
+      // With trim=true, parsing can handle spaces
+      testLongCaseWithPass((String) testCase[0], ((Number) testCase[1]).longValue(), trim);
+    }
+    for (String testCase : casesWithErrors) {
+      testLongCaseWithFail(testCase, trim);
+    }
+  }
 }