diff --git common/src/java/org/apache/hadoop/hive/common/format/datetime/HiveDateTimeFormatter.java common/src/java/org/apache/hadoop/hive/common/format/datetime/HiveDateTimeFormatter.java
new file mode 100644
index 0000000000..a158d4befd
--- /dev/null
+++ common/src/java/org/apache/hadoop/hive/common/format/datetime/HiveDateTimeFormatter.java
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.common.format.datetime;
+
+import org.apache.hadoop.hive.common.type.Date;
+import org.apache.hadoop.hive.common.type.Timestamp;
+
+/**
+ * Interface used for formatting and parsing timestamps and dates. Initially created so that the
+ * user is able to optionally format datetime objects into strings and parse strings into datetime
+ * objects with SQL:2016 semantics, as well as with the legacy (java.text.SimpleDateFormat) format.
+ */
+public interface HiveDateTimeFormatter {
+  /**
+   * Format the given timestamp into a string.
+   *
+   * @throws IllegalArgumentException if timestamp cannot be formatted.
+   */
+  String format(Timestamp ts);
+
+  /**
+   * Format the given date into a string.
+   *
+   * @throws IllegalArgumentException if date cannot be formatted.
+   */
+  String format(Date date);
+
+  /**
+   * Parse the given string into a timestamp.
+   *
+   * @throws IllegalArgumentException if string cannot be parsed.
+   */
+  Timestamp parseTimestamp(String string);
+
+  /**
+   * Parse the given string into a date.
+   *
+   * @throws IllegalArgumentException if string cannot be parsed.
+   */
+  Date parseDate(String string);
+
+  /**
+   * Get the format pattern to be used for formatting datetime objects or parsing strings.
+   */
+  String getPattern();
+}
diff --git common/src/java/org/apache/hadoop/hive/common/format/datetime/HiveJavaDateTimeFormatter.java common/src/java/org/apache/hadoop/hive/common/format/datetime/HiveJavaDateTimeFormatter.java
new file mode 100644
index 0000000000..409a902e65
--- /dev/null
+++ common/src/java/org/apache/hadoop/hive/common/format/datetime/HiveJavaDateTimeFormatter.java
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.common.format.datetime; + +import org.apache.hadoop.hive.common.type.Date; +import org.apache.hadoop.hive.common.type.Timestamp; + +import java.time.Instant; +import java.time.LocalDateTime; +import java.time.ZoneId; +import java.time.ZoneOffset; +import java.time.format.DateTimeFormatter; + +/** + * Wrapper for DateTimeFormatter in the java.time package. + */ +public class HiveJavaDateTimeFormatter implements HiveDateTimeFormatter { + + private DateTimeFormatter formatter; + + public HiveJavaDateTimeFormatter(DateTimeFormatter formatter) { + this.formatter = formatter; + } + + @Override public String format(Timestamp ts) { + return formatter.format( + LocalDateTime.ofInstant( + Instant.ofEpochSecond(ts.toEpochSecond(), ts.getNanos()), ZoneId.of("UTC"))); + } + + @Override public String format(Date date) { + return format(Timestamp.ofEpochMilli(date.toEpochMilli())); + } + + @Override public Timestamp parseTimestamp(String string) { + LocalDateTime ldt = LocalDateTime.parse(string, formatter); + return Timestamp.ofEpochSecond(ldt.toEpochSecond(ZoneOffset.UTC), ldt.getNano()); + } + + @Override public Date parseDate(String string) { + return Date.ofEpochMilli(parseTimestamp(string).toEpochMilli()); + } + + @Override public String getPattern() { + return formatter.toString(); + } +} diff --git common/src/java/org/apache/hadoop/hive/common/format/datetime/HiveSimpleDateFormatter.java common/src/java/org/apache/hadoop/hive/common/format/datetime/HiveSimpleDateFormatter.java new file mode 100644 index 0000000000..faec4891d1 --- /dev/null +++ common/src/java/org/apache/hadoop/hive/common/format/datetime/HiveSimpleDateFormatter.java @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.common.format.datetime; + +import org.apache.hadoop.hive.common.type.Timestamp; + +import java.text.SimpleDateFormat; +import java.time.format.DateTimeFormatter; +import java.util.Date; +import java.util.TimeZone; + +/** + * Wrapper for java.text.SimpleDateFormat. + */ +public class HiveSimpleDateFormatter implements HiveDateTimeFormatter { + + private SimpleDateFormat format = new SimpleDateFormat(); + private String pattern; + + public HiveSimpleDateFormatter(String pattern, TimeZone timeZone) { + setPattern(pattern); + format.setTimeZone(timeZone); + } + + @Override public String format(Timestamp ts) { + Date date = new Date(ts.toEpochMilli()); + return format.format(date); + } + + @Override public String format(org.apache.hadoop.hive.common.type.Date d) { + Date date = new Date(d.toEpochMilli()); + return format.format(date); + } + + @Override public Timestamp parseTimestamp(String string) { + try { + Date date = format.parse(string); + return Timestamp.ofEpochMilli(date.getTime()); + } catch (java.text.ParseException e) { + throw new IllegalArgumentException( + "String " + string + " could not be parsed by java.text.SimpleDateFormat: " + format); + } + } + + @Override public org.apache.hadoop.hive.common.type.Date parseDate(String string) { + return org.apache.hadoop.hive.common.type.Date.ofEpochMilli( + parseTimestamp(string).toEpochMilli()); + } + + private void setPattern(String pattern) { + format.applyPattern(pattern); + this.pattern = pattern; + } + + @Override public String getPattern() { + return pattern; + } +} diff --git common/src/java/org/apache/hadoop/hive/common/format/datetime/HiveSqlDateTimeFormatter.java common/src/java/org/apache/hadoop/hive/common/format/datetime/HiveSqlDateTimeFormatter.java new file mode 100644 index 0000000000..601e38bfd5 --- /dev/null +++ common/src/java/org/apache/hadoop/hive/common/format/datetime/HiveSqlDateTimeFormatter.java @@ -0,0 +1,876 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package org.apache.hadoop.hive.common.format.datetime;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableMap;
+import org.apache.commons.lang.StringUtils;
+import org.apache.hadoop.hive.common.type.Date;
+import org.apache.hadoop.hive.common.type.Timestamp;
+
+import java.time.DateTimeException;
+import java.time.Duration;
+import java.time.Instant;
+import java.time.LocalDateTime;
+import java.time.ZoneOffset;
+import java.time.temporal.ChronoField;
+import java.time.temporal.ChronoUnit;
+import java.time.temporal.TemporalField;
+import java.time.temporal.TemporalUnit;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.TimeZone;
+
+/**
+ * Formatter using SQL:2016 datetime patterns.
+ *
+ * For all tokens:
+ * - Patterns are case-insensitive, except AM/PM and T/Z. See those sections for more details.
+ * - For string to datetime conversion, no duplicate format tokens are allowed, including tokens
+ *   that have the same meaning but different lengths ("Y" and "YY" conflict) or different
+ *   behaviors ("RR" and "YY" conflict).
+ *
+ * For all numeric tokens:
+ * - The "expected length" of the input/output is the number of characters in the token
+ *   (e.g. "YYY": 3, "Y": 1, and so on), with some exceptions (see map SPECIAL_LENGTHS).
+ * - For string to datetime conversion, inputs with fewer digits than expected are accepted if
+ *   followed by a delimiter, e.g. format="YYYY-MM-DD", input="19-1-1", output=2019-01-01 00:00:00.
+ * - For datetime to string conversion, output is left-padded with zeros, e.g. format="DD SSSSS",
+ *   input=2019-01-01 00:00:03, output="01 00003".
+ *
+ * Accepted format tokens:
+ * Note: "|" means "or". "Delimiter" means a separator, token T or Z, or end of input.
+ *
+ * A. Temporal tokens
+ * YYYY
+ * 4-digit year
+ * - For string to datetime conversion, prefix digits for 1-, 2- and 3-digit inputs are obtained
+ *   from the current date.
+ *   E.g. input="9-01-01", pattern="YYYY-MM-DD", current year=2020, output=2029-01-01 00:00:00
+ *
+ * YYY
+ * Last 3 digits of a year
+ * - Gets the prefix digit from the current date.
+ * - Can accept fewer than 3 digits, similarly to YYYY.
+ *
+ * YY
+ * Last 2 digits of a year
+ * - Gets the 2 prefix digits from the current date.
+ * - Can accept fewer than 2 digits, similarly to YYYY.
+ *
+ * Y
+ * Last digit of a year
+ * - Gets the 3 prefix digits from the current date.
+ *
+ * RRRR
+ * 4-digit rounded year
+ * - String to datetime conversion:
+ *   - If 2 digits are provided, acts like RR.
+ *   - If 1, 3 or 4 digits are provided, acts like YYYY.
+ * - For datetime to string conversion, acts like YYYY.
+ *
+ * RR
+ * 2-digit rounded year
+ * - String to datetime conversion (a worked sketch follows the DDD token below):
+ *   - Semantics:
+ *     Input:     Last 2 digits of current year:   First 2 digits of output:
+ *     0 to 49    00 to 49                         First 2 digits of current year
+ *     0 to 49    50 to 99                         First 2 digits of current year + 1
+ *     50 to 99   00 to 49                         First 2 digits of current year - 1
+ *     50 to 99   50 to 99                         First 2 digits of current year
+ *   - If a 1-digit year is provided followed by a delimiter, falls back to YYYY with 1-digit
+ *     year input.
+ * - For datetime to string conversion, acts like YY.
+ *
+ * MM
+ * Month (1-12)
+ * - For string to datetime conversion, conflicts with DDD.
+ *
+ * DD
+ * Day of month (1-31)
+ * - For string to datetime conversion, conflicts with DDD.
+ *
+ * DDD
+ * Day of year (1-366)
+ * - For string to datetime conversion, conflicts with DD and MM.
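+ *
+ * Worked sketch of the RR rounding rule above (illustration only; resolveRoundedYear is a
+ * hypothetical helper, not part of this class, and currentYear is taken from the current date):
+ *
+ *   static int resolveRoundedYear(int last2Input, int currentYear) {
+ *     int currFirst2 = currentYear / 100;           // e.g. 20 for 2019
+ *     int currLast2 = currentYear % 100;            // e.g. 19
+ *     if (last2Input < 50 && currLast2 >= 50) {
+ *       currFirst2 += 1;
+ *     } else if (last2Input >= 50 && currLast2 < 50) {
+ *       currFirst2 -= 1;
+ *     }
+ *     return currFirst2 * 100 + last2Input;         // e.g. input "75" in 2019 gives 1975
+ *   }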
+ *
+ * HH
+ * Hour of day (1-12)
+ * - If no AM/PM is provided, defaults to AM.
+ * - In string to datetime conversion, conflicts with SSSSS and HH24.
+ *
+ * HH12
+ * Hour of day (1-12)
+ * See HH.
+ *
+ * HH24
+ * Hour of day (0-23)
+ * - In string to datetime conversion, conflicts with SSSSS, HH12 and AM/PM.
+ *
+ * MI
+ * Minute of hour (0-59)
+ * - In string to datetime conversion, conflicts with SSSSS.
+ *
+ * SS
+ * Second of minute (0-59)
+ * - In string to datetime conversion, conflicts with SSSSS.
+ *
+ * SSSSS
+ * Second of day (0-86399)
+ * - In string to datetime conversion, conflicts with SS, HH, HH12, HH24, MI, AM/PM.
+ *
+ * FF[1..9]
+ * Fraction of second
+ * - 1..9 indicates the number of decimal digits. "FF" (no number of digits specified) is also
+ *   accepted.
+ * - In datetime to string conversion, "FF" omits trailing zeros, or outputs "0" if the subsecond
+ *   value is 0.
+ * - In string to datetime conversion, fewer digits than expected are accepted if followed by a
+ *   delimiter. "FF" acts like "FF9".
+ *
+ * AM|A.M.
+ * Meridiem indicator
+ * - Datetime to string conversion:
+ *   - AM and PM mean the exact same thing in the pattern,
+ *     e.g. input=2019-01-01 20:00, format="AM", output="PM".
+ *   - Retains the exact format (capitalization and length) provided in the pattern string. If
+ *     p.m. is in the pattern, we expect a.m. or p.m. in the output; if AM is in the pattern, we
+ *     expect AM or PM in the output.
+ * - String to datetime conversion:
+ *   - Conflicts with HH24 and SSSSS.
+ *   - It doesn't matter which meridiem indicator is in the pattern.
+ *     E.g. input="2019-01-01 11:00 p.m.", pattern="YYYY-MM-DD HH12:MI AM",
+ *     output=2019-01-01 23:00:00
+ *
+ * PM|P.M.
+ * Meridiem indicator
+ * See AM|A.M.
+ *
+ * B. Time zone tokens
+ * TZH
+ * Time zone offset hour (-15 to +15)
+ * - 3-character input is expected: 1 character for the sign and 2 digits for the value,
+ *   e.g. "+10", "-05".
+ * - 2-digit input is accepted without the sign, e.g. "04".
+ * - Both the 2- and 3-character forms are accepted even if not followed by separators.
+ * - Disabled for timestamp to string and date to string conversion, as timestamp and date are
+ *   time zone agnostic.
+ *
+ * TZM
+ * Time zone offset minute (0-59)
+ * - For string to datetime conversion:
+ *   - The TZH token is required.
+ *   - Unsigned; the sign comes from TZH.
+ *   - Therefore a time zone offset of "-30" minutes should be expressed thus: input="-00:30",
+ *     pattern="TZH:TZM".
+ * - Disabled for timestamp to string and date to string conversion, as timestamp and date are
+ *   time zone agnostic.
+ *
+ * C. Separators
+ * -|.|/|,|'|;|:|<space>
+ * Separator
+ * - Uses loose matching: a sequence of separators in the format matches a sequence of separators
+ *   in the input, regardless of the separator types or of the lengths of the sequences.
+ *   E.g. input="2019-. ;10/10", pattern="YYYY-MM-DD" is valid; input="20191010",
+ *   pattern="YYYY-MM-DD" is not valid.
+ * - If the last separator character in the separator substring is "-" and is immediately followed
+ *   by a time zone hour (tzh) token, it's a negative sign and not counted as a separator, UNLESS
+ *   this is the only separator character in the separator substring (in which case it is not
+ *   counted as the tzh's negative sign).
+ *
+ * D. ISO 8601 delimiters
+ * T
+ * ISO 8601 delimiter
+ * - Serves as a delimiter.
+ * - Function is to support formats like “YYYY-MM-DDTHH24:MI:SS.FF9Z”, “YYYY-MM-DD-HH24:MI:SSZ” + * - For datetime to string conversion, output is always capitalized ("T"), even if lowercase ("t") + * is provided in the pattern. + * + * Z + * ISO 8601 delimiter + * See T. + */ + +public class HiveSqlDateTimeFormatter implements HiveDateTimeFormatter { + + private static final int LONGEST_TOKEN_LENGTH = 5; + private static final int LONGEST_ACCEPTED_PATTERN = 100; // for sanity's sake + private static final long MINUTES_PER_HOUR = 60; + private static final int FIFTY = 50; + private static final int NANOS_MAX_LENGTH = 9; + public static final int AM = 0; + public static final int PM = 1; + private String pattern; + private List tokens = new ArrayList<>(); + + private static final Map TEMPORAL_TOKENS = + ImmutableMap.builder() + .put("yyyy", ChronoField.YEAR).put("yyy", ChronoField.YEAR) + .put("yy", ChronoField.YEAR).put("y", ChronoField.YEAR) + .put("rrrr", ChronoField.YEAR).put("rr", ChronoField.YEAR) + .put("mm", ChronoField.MONTH_OF_YEAR) + .put("dd", ChronoField.DAY_OF_MONTH) + .put("ddd", ChronoField.DAY_OF_YEAR) + .put("hh", ChronoField.HOUR_OF_AMPM) + .put("hh12", ChronoField.HOUR_OF_AMPM) + .put("hh24", ChronoField.HOUR_OF_DAY) + .put("mi", ChronoField.MINUTE_OF_HOUR) + .put("ss", ChronoField.SECOND_OF_MINUTE) + .put("sssss", ChronoField.SECOND_OF_DAY) + .put("ff1", ChronoField.NANO_OF_SECOND).put("ff2", ChronoField.NANO_OF_SECOND) + .put("ff3", ChronoField.NANO_OF_SECOND).put("ff4", ChronoField.NANO_OF_SECOND) + .put("ff5", ChronoField.NANO_OF_SECOND).put("ff6", ChronoField.NANO_OF_SECOND) + .put("ff7", ChronoField.NANO_OF_SECOND).put("ff8", ChronoField.NANO_OF_SECOND) + .put("ff9", ChronoField.NANO_OF_SECOND).put("ff", ChronoField.NANO_OF_SECOND) + .put("a.m.", ChronoField.AMPM_OF_DAY).put("am", ChronoField.AMPM_OF_DAY) + .put("p.m.", ChronoField.AMPM_OF_DAY).put("pm", ChronoField.AMPM_OF_DAY) + .build(); + + private static final Map TIME_ZONE_TOKENS = + ImmutableMap.builder() + .put("tzh", ChronoUnit.HOURS).put("tzm", ChronoUnit.MINUTES).build(); + + private static final List VALID_ISO_8601_DELIMITERS = + ImmutableList.of("t", "z"); + + private static final List VALID_SEPARATORS = + ImmutableList.of("-", ":", " ", ".", "/", ";", "\'", ","); + + private static final Map SPECIAL_LENGTHS = ImmutableMap.builder() + .put("hh12", 2).put("hh24", 2).put("tzm", 2).put("am", 4).put("pm", 4) + .put("ff1", 1).put("ff2", 2).put("ff3", 3).put("ff4", 4).put("ff5", 5) + .put("ff6", 6).put("ff7", 7).put("ff8", 8).put("ff9", 9).put("ff", 9) + .build(); + + /** + * Represents broad categories of tokens. + */ + public enum TokenType { + TEMPORAL, + SEPARATOR, + TIMEZONE, + ISO_8601_DELIMITER + } + + /** + * Token representation. + */ + public static class Token { + TokenType type; + TemporalField temporalField; // for type TEMPORAL e.g. ChronoField.YEAR + TemporalUnit temporalUnit; // for type TIMEZONE e.g. ChronoUnit.HOURS + String string; // pattern string, e.g. "yyy" + int length; // length (e.g. 
YYY: 3, FF8: 8) + + public Token(TemporalField temporalField, String string, int length) { + this(TokenType.TEMPORAL, temporalField, null, string, length); + } + + public Token(TemporalUnit temporalUnit, String string, int length) { + this(TokenType.TIMEZONE, null, temporalUnit, string, length); + } + + public Token(TokenType tokenType, String string) { + this(tokenType, null, null, string, string.length()); + } + + public Token(TokenType tokenType, TemporalField temporalField, TemporalUnit temporalUnit, + String string, int length) { + this.type = tokenType; + this.temporalField = temporalField; + this.temporalUnit = temporalUnit; + this.string = string; + this.length = length; + } + + @Override public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append(string); + sb.append(" type: "); + sb.append(type); + if (temporalField != null) { + sb.append(" temporalField: "); + sb.append(temporalField); + } else if (temporalUnit != null) { + sb.append(" temporalUnit: "); + sb.append(temporalUnit); + } + return sb.toString(); + } + } + + public HiveSqlDateTimeFormatter(String pattern, boolean forParsing) { + setPattern(pattern, forParsing); + } + + /** + * Parse and perhaps verify the pattern. + */ + private void setPattern(String pattern, boolean forParsing) { + assert pattern.length() < LONGEST_ACCEPTED_PATTERN : "The input format is too long"; + this.pattern = pattern; + + parsePatternToTokens(pattern); + + // throw IllegalArgumentException if pattern is invalid + if (forParsing) { + verifyForParse(); + } else { + verifyForFormat(); + } + } + + /** + * Parse pattern to list of tokens. + */ + private String parsePatternToTokens(String pattern) { + tokens.clear(); + String originalPattern = pattern; + pattern = pattern.toLowerCase(); + + // indexes of the substring we will check (includes begin, does not include end) + int begin=0, end=0; + String candidate; + Token lastAddedToken = null; + + while (begin < pattern.length()) { + // if begin hasn't progressed, then pattern is not parsable + if (begin != end) { + tokens.clear(); + throw new IllegalArgumentException("Bad date/time conversion pattern: " + pattern); + } + + // find next token + for (int i = LONGEST_TOKEN_LENGTH; i > 0; i--) { + end = begin + i; + if (end > pattern.length()) { // don't go past the end of the pattern string + continue; + } + candidate = pattern.substring(begin, end); + if (isSeparator(candidate)) { + lastAddedToken = parseSeparatorToken(candidate, lastAddedToken); + begin = end; + break; + } + if (isIso8601Delimiter(candidate)) { + lastAddedToken = parseIso8601DelimiterToken(candidate); + begin = end; + break; + } + if (isTemporalToken(candidate)) { + lastAddedToken = parseTemporalToken(originalPattern, begin, candidate); + begin = end; + break; + } + if (isTimeZoneToken(candidate)) { + lastAddedToken = parseTimeZoneToken(candidate); + begin = end; + break; + } + } + } + return pattern; + } + + private boolean isSeparator(String candidate) { + return candidate.length() == 1 && VALID_SEPARATORS.contains(candidate); + } + + private boolean isIso8601Delimiter(String candidate) { + return candidate.length() == 1 && VALID_ISO_8601_DELIMITERS.contains(candidate); + } + + private boolean isTemporalToken(String candidate) { + return TEMPORAL_TOKENS.containsKey(candidate); + } + + private boolean isTimeZoneToken(String pattern) { + return TIME_ZONE_TOKENS.containsKey(pattern); + } + + private Token parseSeparatorToken(String candidate, Token lastAddedToken) { + // try to clump separator with immediately 
preceding separators (e.g. "---" counts as one
+    // separator)
+    if (lastAddedToken != null && lastAddedToken.type == TokenType.SEPARATOR) {
+      lastAddedToken.string += candidate;
+      lastAddedToken.length += 1;
+    } else {
+      lastAddedToken = new Token(TokenType.SEPARATOR, candidate);
+      tokens.add(lastAddedToken);
+    }
+    return lastAddedToken;
+  }
+
+  private Token parseIso8601DelimiterToken(String candidate) {
+    Token lastAddedToken;
+    lastAddedToken = new Token(TokenType.ISO_8601_DELIMITER, candidate.toUpperCase());
+    tokens.add(lastAddedToken);
+    return lastAddedToken;
+  }
+
+  private Token parseTemporalToken(String originalPattern, int begin, String candidate) {
+    Token lastAddedToken;
+
+    // for AM/PM, keep original case
+    if (TEMPORAL_TOKENS.get(candidate) == ChronoField.AMPM_OF_DAY) {
+      int subStringEnd = begin + candidate.length();
+      candidate = originalPattern.substring(begin, subStringEnd);
+    }
+    lastAddedToken = new Token(TEMPORAL_TOKENS.get(candidate.toLowerCase()), candidate,
+        getTokenStringLength(candidate.toLowerCase()));
+    tokens.add(lastAddedToken);
+    return lastAddedToken;
+  }
+
+  private Token parseTimeZoneToken(String candidate) {
+    Token lastAddedToken;
+    lastAddedToken = new Token(TIME_ZONE_TOKENS.get(candidate), candidate,
+        getTokenStringLength(candidate));
+    tokens.add(lastAddedToken);
+    return lastAddedToken;
+  }
+
+  private int getTokenStringLength(String candidate) {
+    Integer length = SPECIAL_LENGTHS.get(candidate);
+    if (length != null) {
+      return length;
+    }
+    return candidate.length();
+  }
+
+  /**
+   * Make sure the generated list of tokens is valid for parsing strings to datetime objects.
+   */
+  private void verifyForParse() {
+
+    // create a list of tokens' temporal fields
+    ArrayList<TemporalField> temporalFields = new ArrayList<>();
+    ArrayList<TemporalUnit> timeZoneTemporalUnits = new ArrayList<>();
+    int roundYearCount = 0, yearCount = 0;
+    for (Token token : tokens) {
+      if (token.temporalField != null) {
+        temporalFields.add(token.temporalField);
+        if (token.temporalField == ChronoField.YEAR) {
+          if (token.string.startsWith("r")) {
+            roundYearCount += 1;
+          } else {
+            yearCount += 1;
+          }
+        }
+      } else if (token.temporalUnit != null) {
+        timeZoneTemporalUnits.add(token.temporalUnit);
+      }
+    }
+
+    if (roundYearCount > 0 && yearCount > 0) {
+      throw new IllegalArgumentException("Invalid duplication of format element: Both year and"
+          + " round year are provided");
+    }
+    for (TemporalField tokenType : temporalFields) {
+      if (Collections.frequency(temporalFields, tokenType) > 1) {
+        throw new IllegalArgumentException(
+            "Invalid duplication of format element: multiple " + tokenType.toString()
+                + " tokens provided.");
+      }
+    }
+    if (temporalFields.contains(ChronoField.AMPM_OF_DAY) &&
+        !(temporalFields.contains(ChronoField.HOUR_OF_DAY) ||
+            temporalFields.contains(ChronoField.HOUR_OF_AMPM))) {
+      throw new IllegalArgumentException("Missing hour token.");
+    }
+    if (temporalFields.contains(ChronoField.AMPM_OF_DAY) &&
+        temporalFields.contains(ChronoField.HOUR_OF_DAY)) {
+      throw new IllegalArgumentException("Conflict between meridiem indicator and hour token.");
+    }
+    if (temporalFields.contains(ChronoField.HOUR_OF_AMPM) &&
+        temporalFields.contains(ChronoField.HOUR_OF_DAY)) {
+      throw new IllegalArgumentException("Conflict between hour of day and hour of am/pm token.");
+    }
+    if (temporalFields.contains(ChronoField.DAY_OF_YEAR) &&
+        (temporalFields.contains(ChronoField.DAY_OF_MONTH) ||
+            temporalFields.contains(ChronoField.MONTH_OF_YEAR))) {
+      throw new IllegalArgumentException("Day of year provided with 
day or month token."); + } + if (temporalFields.contains(ChronoField.SECOND_OF_DAY) && + (temporalFields.contains(ChronoField.HOUR_OF_DAY) || + temporalFields.contains(ChronoField.HOUR_OF_AMPM) || + temporalFields.contains(ChronoField.MINUTE_OF_HOUR) || + temporalFields.contains(ChronoField.SECOND_OF_MINUTE))) { + throw new IllegalArgumentException( + "Second of day token conflicts with other token(s)."); + } + if (timeZoneTemporalUnits.contains(ChronoUnit.MINUTES) && + !timeZoneTemporalUnits.contains(ChronoUnit.HOURS)) { + throw new IllegalArgumentException("Time zone minute token provided without time zone hour token."); + } + } + + /** + * Make sure the generated list of tokens is valid for formatting datetime objects to strings. + */ + private void verifyForFormat() { + for (Token token : tokens) { + if (token.type == TokenType.TIMEZONE) { + throw new IllegalArgumentException(token.string.toUpperCase() + " not a valid format for " + + "timestamp or date."); + } + } + } + + @Override public String format(Timestamp ts) { + StringBuilder fullOutputSb = new StringBuilder(); + String outputString = null; + int value; + LocalDateTime localDateTime = + LocalDateTime.ofEpochSecond(ts.toEpochSecond(), ts.getNanos(), ZoneOffset.UTC); + for (Token token : tokens) { + switch (token.type) { + case TEMPORAL: + try { + value = localDateTime.get(token.temporalField); + outputString = formatTemporal(value, token); + } catch (DateTimeException e) { + throw new IllegalArgumentException(token.temporalField + " couldn't be obtained from " + + "LocalDateTime " + localDateTime, e); + } + break; + case TIMEZONE: //invalid for timestamp and date + throw new IllegalArgumentException(token.string.toUpperCase() + " not a valid format for " + + "timestamp or date."); + case SEPARATOR: + outputString = token.string; + break; + case ISO_8601_DELIMITER: + outputString = token.string.toUpperCase(); + break; + default: + // won't happen + } + fullOutputSb.append(outputString); + } + return fullOutputSb.toString(); + } + + @Override public String format(Date date) { + return format(Timestamp.ofEpochSecond(date.toEpochSecond())); + } + + private String formatTemporal(int value, Token token) { + String output; + if (token.temporalField == ChronoField.AMPM_OF_DAY) { + output = value == 0 ? "a" : "p"; + output += token.string.length() == 2 ? "m" : ".m."; + if (token.string.startsWith("A") || token.string.startsWith("P")) { + output = output.toUpperCase(); + } + } else { + // it's a numeric value + try { + output = String.valueOf(value); + output = padOrTruncateNumericTemporal(token, output); + } catch (Exception e) { + throw new IllegalArgumentException("Value: " + value + " couldn't be cast to string.", e); + } + } + return output; + } + + /** + * To match token.length, pad left with zeroes or truncate. 
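+   * E.g. (drawn from the conversions described in the class comment): token "DD", value 1 ->
+   * "01" (pad left); token "FF4", fraction .777777777 -> "7777" (truncate right); token "YY",
+   * year 2018 -> "18" (truncate left); token "FF", fraction .0070070 -> "007007" (pad to 9
+   * digits, then strip trailing zeros).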
+ */ + private String padOrTruncateNumericTemporal(Token token, String output) { + if (output.length() < token.length) { + output = StringUtils.leftPad(output, token.length, '0'); // pad left + } else if (output.length() > token.length) { + if (token.temporalField == ChronoField.NANO_OF_SECOND) { + output = output.substring(0, token.length); // truncate right + } else { + output = output.substring(output.length() - token.length); // truncate left + } + } + if (token.temporalField == ChronoField.NANO_OF_SECOND + && token.string.equalsIgnoreCase("ff")) { + output = output.replaceAll("0*$", ""); //truncate trailing 0's + if (output.isEmpty()) { + output = "0"; + } + } + return output; + } + + /** + * Left here for timestamp with local time zone. + */ + private String formatTimeZone(TimeZone timeZone, LocalDateTime localDateTime, Token token) { + ZoneOffset offset = timeZone.toZoneId().getRules().getOffset(localDateTime); + Duration seconds = Duration.of(offset.get(ChronoField.OFFSET_SECONDS), ChronoUnit.SECONDS); + if (token.string.equals("tzh")) { + long hours = seconds.toHours(); + String s = (hours >= 0) ? "+" : "-"; + s += (Math.abs(hours) < 10) ? "0" : ""; + s += String.valueOf(Math.abs(hours)); + return s; + } else { + long minutes = Math.abs(seconds.toMinutes() % MINUTES_PER_HOUR); + String s = String.valueOf(minutes); + if (s.length() == 1) { + s = "0" + s; + } + return s; + } + } + + @Override public Timestamp parseTimestamp(String fullInput){ + LocalDateTime ldt = LocalDateTime.ofInstant(Instant.EPOCH, ZoneOffset.UTC); + String substring; + int index = 0; + int value; + int timeZoneSign = 0, timeZoneHours = 0, timeZoneMinutes = 0; + + for (Token token : tokens) { + switch (token.type) { + case TEMPORAL: + substring = getNextSubstring(fullInput, index, token); // e.g. yy-m -> yy + value = parseTemporal(substring, token); // e.g. 18->2018, July->07 + try { + ldt = ldt.with(token.temporalField, value); + } catch (DateTimeException e){ + throw new IllegalArgumentException( + "Value " + value + " not valid for token " + token.toString()); + } + index += substring.length(); + break; + case TIMEZONE: + if (token.temporalUnit == ChronoUnit.HOURS) { + String nextCharacter = fullInput.substring(index, index + 1); + timeZoneSign = "-".equals(nextCharacter) ? -1 : 1; + if ("-".equals(nextCharacter) || "+".equals(nextCharacter)) { + index++; + } + // parse next two digits + substring = getNextSubstring(fullInput, index, index + 2, token); + try { + timeZoneHours = Integer.parseInt(substring); + } catch (NumberFormatException e) { + throw new IllegalArgumentException("Couldn't parse substring \"" + substring + + "\" with token " + token + " to int. Pattern is " + pattern, e); + } + if (timeZoneHours < -15 || timeZoneHours > 15) { + throw new IllegalArgumentException("Couldn't parse substring \"" + substring + + "\" to TZH because TZH range is -15 to +15. Pattern is " + pattern); + } + } else { // time zone minutes + substring = getNextSubstring(fullInput, index, token); + try { + timeZoneMinutes = Integer.parseInt(substring); + } catch (NumberFormatException e) { + throw new IllegalArgumentException("Couldn't parse substring \"" + substring + + "\" with token " + token + " to int. Pattern is " + pattern, e); + } + if (timeZoneMinutes < 0 || timeZoneMinutes > 59) { + throw new IllegalArgumentException("Couldn't parse substring \"" + substring + + "\" to TZM because TZM range is 0 to 59. 
Pattern is " + pattern); + } + } + index += substring.length(); + break; + case SEPARATOR: + index = parseSeparator(fullInput, index, token); + break; + case ISO_8601_DELIMITER: + index = parseIso8601Delimiter(fullInput, index, token); + default: + //do nothing + } + } + // time zone hours -- process here because hh/hh24 may be parsed after tzh + ldt = ldt.minus(timeZoneSign * timeZoneHours, ChronoUnit.HOURS); + // time zone minutes -- process here because sign depends on tzh sign + ldt = ldt.minus( + timeZoneSign * timeZoneMinutes, ChronoUnit.MINUTES); + + // anything left unparsed at end of string? throw error + if (!fullInput.substring(index).isEmpty()) { + throw new IllegalArgumentException("Leftover input after parsing: " + + fullInput.substring(index) + " in string " + fullInput); + } + + return Timestamp.ofEpochSecond(ldt.toEpochSecond(ZoneOffset.UTC), ldt.getNano()); + } + + public Date parseDate(String input){ + return Date.ofEpochMilli(parseTimestamp(input).toEpochMilli()); + } + /** + * Return the next substring to parse. Length is either specified or token.length, but a + * separator or an ISO-8601 delimiter can cut the substring short. (e.g. if the token pattern is + * "YYYY" we expect the next 4 characters to be 4 numbers. However, if it is "976/" then we + * return "976" because a separator cuts it short.) + */ + private String getNextSubstring(String s, int begin, Token token) { + return getNextSubstring(s, begin, begin + token.length, token); + } + + private String getNextSubstring(String s, int begin, int end, Token token) { + if (end > s.length()) { + end = s.length(); + } + s = s.substring(begin, end); + if (token.temporalField == ChronoField.AMPM_OF_DAY) { + if (s.charAt(1) == 'm' || s.charAt(1) == 'M') { // length 2 + return s.substring(0, 2); + } else { + return s; + } + } + for (String sep : VALID_SEPARATORS) { + if (s.contains(sep)) { + s = s.substring(0, s.indexOf(sep)); + } + } + // TODO this will cause problems with DAY (for example, Thursday starts with T) + for (String delimiter : VALID_ISO_8601_DELIMITERS) { + if (s.toLowerCase().contains(delimiter)) { + s = s.substring(0, s.toLowerCase().indexOf(delimiter)); + } + } + + return s; + } + + /** + * Get the integer value of a temporal substring. + */ + private int parseTemporal(String substring, Token token){ + // exceptions to the rule + if (token.temporalField == ChronoField.AMPM_OF_DAY) { + return substring.toLowerCase().startsWith("a") ? 
AM : PM; + + } else if (token.temporalField == ChronoField.YEAR) { + String currentYearString = String.valueOf(LocalDateTime.now().getYear()); + //deal with round years + if (token.string.startsWith("r") && substring.length() == 2) { + int currFirst2Digits = Integer.parseInt(currentYearString.substring(0, 2)); + int currLast2Digits = Integer.parseInt(currentYearString.substring(2)); + int valLast2Digits = Integer.parseInt(substring); + if (valLast2Digits < FIFTY && currLast2Digits >= FIFTY) { + currFirst2Digits += 1; + } else if (valLast2Digits >= FIFTY && currLast2Digits < FIFTY) { + currFirst2Digits -= 1; + } + substring = String.valueOf(currFirst2Digits) + substring; + } else { // fill in prefix digits with current date + substring = currentYearString.substring(0, 4 - substring.length()) + substring; + } + + } else if (token.temporalField == ChronoField.NANO_OF_SECOND) { + int i = Integer.min(token.length, substring.length()); + substring += StringUtils.repeat("0", NANOS_MAX_LENGTH - i); + } + + // the rule + try { + return Integer.parseInt(substring); + } catch (NumberFormatException e) { + throw new IllegalArgumentException("Couldn't parse substring \"" + substring + + "\" with token " + token + " to integer. Pattern is " + pattern, e); + } + } + + /** + * Parse the next separator(s). At least one separator character is expected. Separator + * characters are interchangeable. + * + * Caveat: If the last separator character in the separator substring is "-" and is immediately + * followed by a time zone hour (tzh) token, it's a negative sign and not counted as a + * separator, UNLESS this is the only separator character in the separator substring (in + * which case it is not counted as the negative sign). + * + * @throws IllegalArgumentException if separator is missing + */ + private int parseSeparator(String fullInput, int index, Token token){ + int separatorsFound = 0; + int begin = index; + + while (index < fullInput.length() && + VALID_SEPARATORS.contains(fullInput.substring(index, index + 1))) { + if (!isLastCharacterOfSeparator(index, fullInput) || !(nextTokenIs("tzh", token)) + || separatorsFound == 0) { + separatorsFound++; + } + index++; + } + + if (separatorsFound == 0) { + throw new IllegalArgumentException("Missing separator at index " + index); + } + return begin + separatorsFound; + } + + private int parseIso8601Delimiter(String fullInput, int index, Token token) { + String substring; + substring = fullInput.substring(index, index + 1); + if (token.string.equalsIgnoreCase(substring)) { + index++; + } else { + throw new IllegalArgumentException( + "Missing ISO 8601 delimiter " + token.string.toUpperCase()); + } + return index; + } + + /** + * Is the next character something other than a separator? + */ + private boolean isLastCharacterOfSeparator(int index, String string) { + if (index == string.length() - 1) { // if we're at the end of the string, yes + return true; + } + return !VALID_SEPARATORS.contains(string.substring(index + 1, index + 2)); + } + + /** + * Does the temporalUnit/temporalField of the next token match the pattern's? 
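+   * E.g. (illustrative input, not from the tests) for pattern "YYYY-MM-DD HH24:MI TZH:TZM" and
+   * input "2019-1-1 14:00--1:30", the separator substring "--" precedes the TZH value: only the
+   * first "-" counts as a separator, and the trailing "-" is handed to TZH as its negative sign
+   * because nextTokenIs("tzh", ...) is true. A lone "-" there would count as a separator instead.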
+ */ + private boolean nextTokenIs(String pattern, Token currentToken) { + // make sure currentToken isn't the last one + if (tokens.indexOf(currentToken) == tokens.size() - 1) { + return false; + } + Token nextToken = tokens.get(tokens.indexOf(currentToken) + 1); + pattern = pattern.toLowerCase(); + return (isTimeZoneToken(pattern) && TIME_ZONE_TOKENS.get(pattern) == nextToken.temporalUnit + || isTemporalToken(pattern) && TEMPORAL_TOKENS.get(pattern) == nextToken.temporalField); + } + + @Override public String getPattern() { + return pattern; + } + + /** + * @return a copy of token list + */ + protected List getTokens() { + return new ArrayList<>(tokens); + } +} diff --git common/src/java/org/apache/hadoop/hive/common/format/datetime/package-info.java common/src/java/org/apache/hadoop/hive/common/format/datetime/package-info.java new file mode 100644 index 0000000000..1e838be886 --- /dev/null +++ common/src/java/org/apache/hadoop/hive/common/format/datetime/package-info.java @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Deals with formatting and parsing of datetime objects. + */ +package org.apache.hadoop.hive.common.format.datetime; diff --git common/src/java/org/apache/hadoop/hive/common/type/Date.java common/src/java/org/apache/hadoop/hive/common/type/Date.java index 6ecfcf65c9..c1eb47153e 100644 --- common/src/java/org/apache/hadoop/hive/common/type/Date.java +++ common/src/java/org/apache/hadoop/hive/common/type/Date.java @@ -1,4 +1,4 @@ -/** +/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - *
- * http://www.apache.org/licenses/LICENSE-2.0
- * 
+ * + * http://www.apache.org/licenses/LICENSE-2.0 + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -17,6 +17,9 @@ */ package org.apache.hadoop.hive.common.type; + +import org.apache.hadoop.hive.common.format.datetime.HiveDateTimeFormatter; + import java.time.Instant; import java.time.LocalDate; import java.time.LocalDateTime; @@ -72,6 +75,17 @@ public String toString() { return localDate.format(PRINT_FORMATTER); } + public String toStringFormatted(HiveDateTimeFormatter formatter) { + if (formatter == null) { + return toString(); + } + try { + return formatter.format(this); + } catch (IllegalArgumentException e) { + return null; + } + } + public int hashCode() { return localDate.hashCode(); } @@ -137,6 +151,13 @@ public static Date valueOf(String s) { return new Date(localDate); } + public static Date valueOf(String s, HiveDateTimeFormatter formatter) { + if (formatter == null) { + return valueOf(s); + } + return formatter.parseDate(s); + } + public static Date ofEpochDay(int epochDay) { return new Date(LocalDate.ofEpochDay(epochDay)); } diff --git common/src/java/org/apache/hadoop/hive/common/type/Timestamp.java common/src/java/org/apache/hadoop/hive/common/type/Timestamp.java index a8b7b6d186..cea1e8c2e1 100644 --- common/src/java/org/apache/hadoop/hive/common/type/Timestamp.java +++ common/src/java/org/apache/hadoop/hive/common/type/Timestamp.java @@ -1,4 +1,4 @@ -/** +/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - *
- * http://www.apache.org/licenses/LICENSE-2.0
- * 
+ * + * http://www.apache.org/licenses/LICENSE-2.0 + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -17,6 +17,8 @@ */ package org.apache.hadoop.hive.common.type; +import org.apache.hadoop.hive.common.format.datetime.HiveDateTimeFormatter; + import java.time.Instant; import java.time.LocalDateTime; import java.time.ZoneOffset; @@ -101,6 +103,17 @@ public String toString() { return localDateTime.format(PRINT_FORMATTER); } + public String toStringFormatted(HiveDateTimeFormatter formatter) { + if (formatter == null) { + return toString(); + } + try { + return formatter.format(this); + } catch (IllegalArgumentException e) { + return null; + } + } + public int hashCode() { return localDateTime.hashCode(); } @@ -166,6 +179,13 @@ public static Timestamp valueOf(String s) { return new Timestamp(localDateTime); } + public static Timestamp valueOf(String s, HiveDateTimeFormatter formatter) { + if (formatter == null) { + return valueOf(s); + } + return formatter.parseTimestamp(s); + } + public static Timestamp ofEpochSecond(long epochSecond) { return ofEpochSecond(epochSecond, 0); } diff --git common/src/java/org/apache/hadoop/hive/common/type/TimestampUtils.java common/src/java/org/apache/hadoop/hive/common/type/TimestampUtils.java index f26f8ae01e..525c95a63d 100644 --- common/src/java/org/apache/hadoop/hive/common/type/TimestampUtils.java +++ common/src/java/org/apache/hadoop/hive/common/type/TimestampUtils.java @@ -18,6 +18,7 @@ package org.apache.hadoop.hive.common.type; +import org.apache.hadoop.hive.common.format.datetime.HiveDateTimeFormatter; import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; import java.math.BigDecimal; @@ -171,6 +172,18 @@ public static long millisToSeconds(long millis) { private static final int DATE_LENGTH = "YYYY-MM-DD".length(); + public static Timestamp stringToTimestamp(String s, HiveDateTimeFormatter formatter) { + if (formatter == null) { + return stringToTimestamp(s); + } + + try { + return Timestamp.valueOf(s, formatter); + } catch (IllegalArgumentException e) { + return null; + } + } + public static Timestamp stringToTimestamp(String s) { s = s.trim(); // Handle simpler cases directly avoiding exceptions diff --git common/src/java/org/apache/hive/common/util/DateParser.java common/src/java/org/apache/hive/common/util/DateParser.java index 5db14f1906..22bcd98c1d 100644 --- common/src/java/org/apache/hive/common/util/DateParser.java +++ common/src/java/org/apache/hive/common/util/DateParser.java @@ -17,6 +17,7 @@ */ package org.apache.hive.common.util; +import org.apache.hadoop.hive.common.format.datetime.HiveDateTimeFormatter; import org.apache.hadoop.hive.common.type.Date; /** @@ -36,9 +37,13 @@ public Date parseDate(String strValue) { } public boolean parseDate(String strValue, Date result) { + return parseDate(strValue, result, null); + } + + public boolean parseDate(String strValue, Date result, HiveDateTimeFormatter formatter) { Date parsedVal; try { - parsedVal = Date.valueOf(strValue); + parsedVal = Date.valueOf(strValue, formatter); } catch (IllegalArgumentException e) { parsedVal = null; } diff --git common/src/test/org/apache/hadoop/hive/common/format/datetime/TestHiveJavaDateTimeFormatter.java common/src/test/org/apache/hadoop/hive/common/format/datetime/TestHiveJavaDateTimeFormatter.java new file mode 100644 index 0000000000..82009f08e1 --- /dev/null +++ 
common/src/test/org/apache/hadoop/hive/common/format/datetime/TestHiveJavaDateTimeFormatter.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.common.format.datetime; + +import org.apache.hadoop.hive.common.type.Timestamp; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import java.time.format.DateTimeFormatter; +import java.time.format.DateTimeFormatterBuilder; +import java.time.temporal.ChronoField; + +/** + * Test class for HiveJavaDateTimeFormatter. + */ +public class TestHiveJavaDateTimeFormatter { + + private static final DateTimeFormatter DATE_TIME_FORMATTER; + static { + DateTimeFormatterBuilder builder = new DateTimeFormatterBuilder(); + builder.append(DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss")); + builder.optionalStart().appendFraction(ChronoField.NANO_OF_SECOND, 0, 9, true).optionalEnd(); + DATE_TIME_FORMATTER = builder.toFormatter(); + } + private HiveDateTimeFormatter formatter = new HiveJavaDateTimeFormatter(DATE_TIME_FORMATTER); + + @Test + public void testFormat() { + Timestamp ts = Timestamp.valueOf("2019-01-01 00:00:00.99999"); + Assert.assertEquals("2019-01-01 00:00:00.99999", formatter.format(ts)); + } + + @Test + public void testParse() { + String s = "2019-01-01 00:00:00.99999"; + Assert.assertEquals(Timestamp.valueOf("2019-01-01 00:00:00.99999"), + formatter.parseTimestamp(s)); + } + +} diff --git common/src/test/org/apache/hadoop/hive/common/format/datetime/TestHiveSimpleDateFormatter.java common/src/test/org/apache/hadoop/hive/common/format/datetime/TestHiveSimpleDateFormatter.java new file mode 100644 index 0000000000..d189c7b042 --- /dev/null +++ common/src/test/org/apache/hadoop/hive/common/format/datetime/TestHiveSimpleDateFormatter.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.common.format.datetime; + +import org.apache.hadoop.hive.common.type.Timestamp; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import java.text.SimpleDateFormat; +import java.time.ZoneOffset; +import java.util.TimeZone; + +/** + * Tests HiveSimpleDateFormatter. + */ +public class TestHiveSimpleDateFormatter { + + private HiveDateTimeFormatter formatter = + new HiveSimpleDateFormatter("yyyy-MM-dd HH:mm:ss", TimeZone.getTimeZone(ZoneOffset.UTC)); + + @Test + public void testFormat() { + verifyFormat("2019-01-01 01:01:01"); + verifyFormat("2019-01-01 00:00:00"); + verifyFormat("1960-01-01 23:00:00"); + } + + private void verifyFormat(String s) { + Timestamp ts = Timestamp.valueOf(s); + Assert.assertEquals(s, formatter.format(ts)); + } + + @Test + public void testParse() { + verifyParse("2019-01-01 01:10:10"); + verifyParse("1960-01-01 23:00:00"); + + } + + private void verifyParse(String s) { + Timestamp ts = Timestamp.valueOf(s); + Assert.assertEquals(ts, formatter.parseTimestamp(s)); + } +} diff --git common/src/test/org/apache/hadoop/hive/common/format/datetime/TestHiveSqlDateTimeFormatter.java common/src/test/org/apache/hadoop/hive/common/format/datetime/TestHiveSqlDateTimeFormatter.java new file mode 100644 index 0000000000..1557f41032 --- /dev/null +++ common/src/test/org/apache/hadoop/hive/common/format/datetime/TestHiveSqlDateTimeFormatter.java @@ -0,0 +1,308 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package org.apache.hadoop.hive.common.format.datetime;
+
+import junit.framework.TestCase;
+import org.apache.hadoop.hive.common.type.Date;
+import org.apache.hadoop.hive.common.type.Timestamp;
+
+import java.time.LocalDate;
+import java.time.LocalDateTime;
+import java.time.ZoneOffset;
+import java.time.format.DateTimeFormatter;
+import java.time.format.DateTimeFormatterBuilder;
+import java.time.format.ResolverStyle;
+import java.time.format.SignStyle;
+import java.time.temporal.ChronoField;
+import java.time.temporal.TemporalField;
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import static java.time.temporal.ChronoField.DAY_OF_MONTH;
+import static java.time.temporal.ChronoField.HOUR_OF_DAY;
+import static java.time.temporal.ChronoField.MINUTE_OF_HOUR;
+import static java.time.temporal.ChronoField.MONTH_OF_YEAR;
+import static java.time.temporal.ChronoField.SECOND_OF_MINUTE;
+import static java.time.temporal.ChronoField.YEAR;
+
+/**
+ * Tests HiveSqlDateTimeFormatter.
+ */
+
+public class TestHiveSqlDateTimeFormatter extends TestCase {
+
+  private HiveSqlDateTimeFormatter formatter;
+
+  public void testSetPattern() {
+    verifyPatternParsing(" ---yyyy-\'-:- -,.;/MM-dd--", new ArrayList<>(Arrays.asList(
+        null, // represents separator, which has no temporal field
+        ChronoField.YEAR,
+        null,
+        ChronoField.MONTH_OF_YEAR,
+        null,
+        ChronoField.DAY_OF_MONTH,
+        null
+    )));
+
+    verifyPatternParsing("ymmdddhh24::mi:ss A.M. pm", 25, "ymmdddhh24::mi:ss A.M. pm",
+        new ArrayList<>(Arrays.asList(
+            ChronoField.YEAR,
+            ChronoField.MONTH_OF_YEAR,
+            ChronoField.DAY_OF_YEAR,
+            ChronoField.HOUR_OF_DAY,
+            null, ChronoField.MINUTE_OF_HOUR,
+            null, ChronoField.SECOND_OF_MINUTE,
+            null, ChronoField.AMPM_OF_DAY,
+            null, ChronoField.AMPM_OF_DAY
+        )));
+  }
+
+  public void testSetPatternWithBadPatterns() {
+    verifyBadPattern("e", true);
+    verifyBadPattern("yyyy-1", true);
+
+    verifyBadPattern("yyyy Y", true);
+    verifyBadPattern("yyyy R", true);
+    verifyBadPattern("yyyy-MM-DDD", true);
+    verifyBadPattern("yyyy-mm-DD DDD", true);
+    verifyBadPattern("yyyy-mm-dd HH24 HH12", true);
+    verifyBadPattern("yyyy-mm-dd HH24 AM", true);
+    verifyBadPattern("yyyy-mm-dd HH24 SSSSS", true);
+    verifyBadPattern("yyyy-mm-dd HH12 SSSSS", true);
+    verifyBadPattern("yyyy-mm-dd SSSSS AM", true);
+    verifyBadPattern("yyyy-mm-dd MI SSSSS", true);
+    verifyBadPattern("yyyy-mm-dd SS SSSSS", true);
+
+    verifyBadPattern("tzm", false);
+    verifyBadPattern("tzh", false);
+  }
+
+  public void testFormatTimestamp() {
+    checkFormatTs("rr rrrr ddd", "2018-01-03 00:00:00", "18 2018 003");
+    checkFormatTs("yyyy-mm-ddtsssss.ff4z", "2018-02-03 00:00:10.777777777", "2018-02-03T00010.7777Z");
+    checkFormatTs("hh24:mi:ss.ff1", "2018-02-03 01:02:03.999999999", "01:02:03.9");
+    checkFormatTs("y yyy hh:mi:ss.ffz", "2018-02-03 01:02:03.0070070", "8 018 01:02:03.007007Z");
+    checkFormatTs("am a.m. pm p.m. AM A.M. PM P.M.", "2018-02-03 01:02:03.0070070", "am a.m. am a.m. AM A.M. 
AM A.M."); + } + + private void checkFormatTs(String pattern, String input, String expectedOutput) { + formatter = new HiveSqlDateTimeFormatter(pattern, false); + assertEquals(expectedOutput, formatter.format(toTimestamp(input))); + } + + public void testFormatDate() { + checkFormatDate("rr rrrr ddd", "2018-01-03", "18 2018 003"); + checkFormatDate("yyyy-mm-ddtsssss.ff4z", "2018-02-03", "2018-02-03T00000.0000Z"); + checkFormatDate("hh24:mi:ss.ff1", "2018-02-03", "00:00:00.0"); + checkFormatDate("y yyy T hh:mi:ss.ffz", "2018-02-03", "8 018 T 00:00:00.0Z"); + checkFormatDate("am a.m. pm p.m. AM A.M. PM P.M.", "2018-02-03", "am a.m. am a.m. AM A.M. AM A.M."); + checkFormatDate("DDD", "2019-12-31", "365"); + checkFormatDate("DDD", "2020-12-31", "366"); + } + + private void checkFormatDate(String pattern, String input, String expectedOutput) { + formatter = new HiveSqlDateTimeFormatter(pattern, false); + assertEquals(expectedOutput, formatter.format(toDate(input))); + } + + public void testParseTimestamp() { + String thisYearString = String.valueOf(LocalDateTime.now().getYear()); + int firstTwoDigits = getFirstTwoDigits(); + + //y + checkParseTimestamp("y-mm-dd", "0-02-03", thisYearString.substring(0, 3) + "0-02-03 00:00:00"); + checkParseTimestamp("yy-mm-dd", "00-02-03", thisYearString.substring(0, 2) + "00-02-03 00:00:00"); + checkParseTimestamp("yyy-mm-dd", "000-02-03", thisYearString.substring(0, 1) + "000-02-03 00:00:00"); + checkParseTimestamp("yyyy-mm-dd", "000-02-03", thisYearString.substring(0, 1) + "000-02-03 00:00:00"); + checkParseTimestamp("rr-mm-dd", "0-02-03", thisYearString.substring(0, 3) + "0-02-03 00:00:00"); + checkParseTimestamp("rrrr-mm-dd", "000-02-03", thisYearString.substring(0, 1) + "000-02-03 00:00:00"); + + //rr, rrrr + checkParseTimestamp("rr-mm-dd", "00-02-03", firstTwoDigits + 1 + "00-02-03 00:00:00"); + checkParseTimestamp("rr-mm-dd", "49-02-03", firstTwoDigits + 1 + "49-02-03 00:00:00"); + checkParseTimestamp("rr-mm-dd", "50-02-03", firstTwoDigits + "50-02-03 00:00:00"); + checkParseTimestamp("rr-mm-dd", "99-02-03", firstTwoDigits + "99-02-03 00:00:00"); + checkParseTimestamp("rrrr-mm-dd", "00-02-03", firstTwoDigits + 1 + "00-02-03 00:00:00"); + checkParseTimestamp("rrrr-mm-dd", "49-02-03", firstTwoDigits + 1 + "49-02-03 00:00:00"); + checkParseTimestamp("rrrr-mm-dd", "50-02-03", firstTwoDigits + "50-02-03 00:00:00"); + checkParseTimestamp("rrrr-mm-dd", "99-02-03", firstTwoDigits + "99-02-03 00:00:00"); + + //everything else + checkParseTimestamp("yyyy-mm-ddThh24:mi:ss.ff8z", "2018-02-03T04:05:06.5665Z", "2018-02-03 04:05:06.5665"); + checkParseTimestamp("yyyy-mm-dd hh24:mi:ss.ff", "2018-02-03 04:05:06.555555555", "2018-02-03 04:05:06.555555555"); + checkParseTimestamp("yyyy-mm-dd hh12:mi:ss", "2099-2-03 04:05:06", "2099-02-03 04:05:06"); + checkParseTimestamp("yyyyddd", "2018284", "2018-10-11 00:00:00"); + checkParseTimestamp("yyyyddd", "20184", "2018-01-04 00:00:00"); + checkParseTimestamp("yyyy-mm-ddThh24:mi:ss.ffz", "2018-02-03t04:05:06.444Z", "2018-02-03 04:05:06.444"); + checkParseTimestamp("hh:mi:ss A.M.", "04:05:06 P.M.", "1970-01-01 16:05:06"); + checkParseTimestamp("YYYY-MM-DD HH24:MI TZH:TZM", "2019-1-1 14:00--1:-30", "2019-01-01 15:30:00"); + checkParseTimestamp("YYYY-MM-DD HH24:MI TZH:TZM", "2019-1-1 14:00-1:30", "2019-01-01 12:30:00"); + checkParseTimestamp("TZM:TZH", "1 -3", "1970-01-01 03:01:00"); + checkParseTimestamp("TZH:TZM", "-0:30", "1970-01-01 00:30:00"); + checkParseTimestamp("TZM/YYY-MM-TZH/DD", "0/333-01-11/02", "2333-01-01 13:00:00"); 
+ checkParseTimestamp("YYYY-MM-DD HH12:MI AM", "2019-01-01 11:00 p.m.", "2019-01-01 23:00:00"); + checkParseTimestamp("YYYY-MM-DD HH12:MI A.M..", "2019-01-01 11:00 pm.", "2019-01-01 23:00:00"); + + //Test "day in year" token in a leap year scenario + checkParseTimestamp("YYYY DDD", "2000 60", "2000-02-29 00:00:00"); + checkParseTimestamp("YYYY DDD", "2000 61", "2000-03-01 00:00:00"); + checkParseTimestamp("YYYY DDD", "2000 366", "2000-12-31 00:00:00"); + //Test timezone offset parsing without separators + checkParseTimestamp("YYYYMMDDHH12MIA.M.TZHTZM", "201812310800AM+0515", "2018-12-31 02:45:00"); + checkParseTimestamp("YYYYMMDDHH12MIA.M.TZHTZM", "201812310800AM0515", "2018-12-31 02:45:00"); + checkParseTimestamp("YYYYMMDDHH12MIA.M.TZHTZM", "201812310800AM-0515", "2018-12-31 13:15:00"); + } + + private int getFirstTwoDigits() { + int thisYear = LocalDateTime.now().getYear(); + int firstTwoDigits = thisYear / 100; + if (thisYear % 100 < 50) { + firstTwoDigits -= 1; + } + return firstTwoDigits; + } + + private void checkParseTimestamp(String pattern, String input, String expectedOutput) { + formatter = new HiveSqlDateTimeFormatter(pattern, true); + assertEquals(toTimestamp(expectedOutput), formatter.parseTimestamp(input)); + } + + public void testParseDate() { + + String thisYearString = String.valueOf(LocalDateTime.now().getYear()); + int firstTwoDigits = getFirstTwoDigits(); + //y + checkParseDate("y-mm-dd", "0-02-03", thisYearString.substring(0, 3) + "0-02-03"); + checkParseDate("yy-mm-dd", "00-02-03", thisYearString.substring(0, 2) + "00-02-03"); + checkParseDate("yyy-mm-dd", "000-02-03", thisYearString.substring(0, 1) + "000-02-03"); + checkParseDate("yyyy-mm-dd", "000-02-03", thisYearString.substring(0, 1) + "000-02-03"); + checkParseDate("rr-mm-dd", "0-02-03", thisYearString.substring(0, 3) + "0-02-03"); + checkParseDate("rrrr-mm-dd", "000-02-03", thisYearString.substring(0, 1) + "000-02-03"); + + //rr, rrrr + checkParseDate("rr-mm-dd", "00-02-03", firstTwoDigits + 1 + "00-02-03"); + checkParseDate("rr-mm-dd", "49-02-03", firstTwoDigits + 1 + "49-02-03"); + checkParseDate("rr-mm-dd", "50-02-03", firstTwoDigits + "50-02-03"); + checkParseDate("rr-mm-dd", "99-02-03", firstTwoDigits + "99-02-03"); + checkParseDate("rrrr-mm-dd", "00-02-03", firstTwoDigits + 1 + "00-02-03"); + checkParseDate("rrrr-mm-dd", "49-02-03", firstTwoDigits + 1 + "49-02-03"); + checkParseDate("rrrr-mm-dd", "50-02-03", firstTwoDigits + "50-02-03"); + checkParseDate("rrrr-mm-dd", "99-02-03", firstTwoDigits + "99-02-03"); + + checkParseDate("yyyy-mm-dd hh mi ss.ff7", "2018/01/01 2.2.2.55", "2018-01-01"); + } + + private void checkParseDate(String pattern, String input, String expectedOutput) { + formatter = new HiveSqlDateTimeFormatter(pattern, true); + assertEquals(toDate(expectedOutput), formatter.parseDate(input)); + } + + public void testParseTimestampError() { + verifyBadParseString("yyyy", "2019-02-03"); + verifyBadParseString("yyyy-mm-dd ", "2019-02-03"); //separator missing + verifyBadParseString("yyyy-mm-dd", "2019-02-03..."); //extra separators + verifyBadParseString("yyyy-mm-dd hh12:mi:ss", "2019-02-03 14:00:00"); //hh12 out of range + verifyBadParseString("yyyy-dddsssss", "2019-912345"); + verifyBadParseString("yyyy-mm-dd", "2019-13-23"); //mm out of range + verifyBadParseString("yyyy-mm-dd tzh:tzm", "2019-01-01 +16:00"); //tzh out of range + verifyBadParseString("yyyy-mm-dd tzh:tzm", "2019-01-01 +14:60"); //tzm out of range + verifyBadParseString("YYYY DDD", "2000 367"); //ddd out of range + } + + 
private void verifyBadPattern(String string, boolean forParsing) {
+    try {
+      formatter = new HiveSqlDateTimeFormatter(string, forParsing);
+      fail();
+    } catch (Exception e) {
+      assertEquals(IllegalArgumentException.class.getName(), e.getClass().getName());
+    }
+  }
+
+  /**
+   * Verify the pattern is parsed correctly.
+   * Check:
+   * - token.temporalField for each token
+   * - sum of token.lengths
+   * - concatenation of token.strings
+   */
+  private void verifyPatternParsing(String pattern, ArrayList<TemporalField> temporalFields) {
+    verifyPatternParsing(pattern, pattern.length(), pattern.toLowerCase(), temporalFields);
+  }
+
+  private void verifyPatternParsing(String pattern, int expectedPatternLength,
+      String expectedPattern, ArrayList<TemporalField> temporalFields) {
+    formatter = new HiveSqlDateTimeFormatter(pattern, false);
+    assertEquals(temporalFields.size(), formatter.getTokens().size());
+    StringBuilder sb = new StringBuilder();
+    int actualPatternLength = 0;
+    for (int i = 0; i < temporalFields.size(); i++) {
+      assertEquals("Generated list of tokens not correct", temporalFields.get(i),
+          formatter.getTokens().get(i).temporalField);
+      sb.append(formatter.getTokens().get(i).string);
+      actualPatternLength += formatter.getTokens().get(i).length;
+    }
+    assertEquals("Token strings concatenated don't match original pattern string",
+        expectedPattern, sb.toString());
+    assertEquals(expectedPatternLength, actualPatternLength);
+  }
+
+  private void verifyBadParseString(String pattern, String string) {
+    try {
+      formatter = new HiveSqlDateTimeFormatter(pattern, true);
+      formatter.parseTimestamp(string);
+      fail();
+    } catch (Exception e) {
+      assertEquals(IllegalArgumentException.class.getName(), e.getClass().getName());
+    }
+  }
+
+
+  // Methods that construct datetime objects using java.time.DateTimeFormatter.
+
+  public static Date toDate(String s) {
+    LocalDate localDate = LocalDate.parse(s, DATE_FORMATTER);
+    return Date.ofEpochDay((int) localDate.toEpochDay());
+  }
+
+  /**
+   * This is effectively the old Timestamp.valueOf method.
+ */ + public static Timestamp toTimestamp(String s) { + LocalDateTime localDateTime = LocalDateTime.parse(s.trim(), TIMESTAMP_FORMATTER); + return Timestamp.ofEpochSecond( + localDateTime.toEpochSecond(ZoneOffset.UTC), localDateTime.getNano()); + } + + private static final DateTimeFormatter DATE_FORMATTER = + DateTimeFormatter.ofPattern("yyyy-MM-dd"); + private static final DateTimeFormatter TIMESTAMP_FORMATTER; + static { + DateTimeFormatterBuilder builder = new DateTimeFormatterBuilder(); + builder.appendValue(YEAR, 1, 10, SignStyle.NORMAL).appendLiteral('-') + .appendValue(MONTH_OF_YEAR, 1, 2, SignStyle.NORMAL).appendLiteral('-') + .appendValue(DAY_OF_MONTH, 1, 2, SignStyle.NORMAL) + .optionalStart().appendLiteral(" ") + .appendValue(HOUR_OF_DAY, 1, 2, SignStyle.NORMAL).appendLiteral(':') + .appendValue(MINUTE_OF_HOUR, 1, 2, SignStyle.NORMAL).appendLiteral(':') + .appendValue(SECOND_OF_MINUTE, 1, 2, SignStyle.NORMAL) + .optionalStart().appendFraction(ChronoField.NANO_OF_SECOND, 1, 9, true).optionalEnd() + .optionalEnd(); + TIMESTAMP_FORMATTER = builder.toFormatter().withResolverStyle(ResolverStyle.LENIENT); + } +} diff --git common/src/test/org/apache/hadoop/hive/common/format/datetime/package-info.java common/src/test/org/apache/hadoop/hive/common/format/datetime/package-info.java new file mode 100644 index 0000000000..70ee4266f4 --- /dev/null +++ common/src/test/org/apache/hadoop/hive/common/format/datetime/package-info.java @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Tests formatting and parsing of datetime objects. 
+ */ +package org.apache.hadoop.hive.common.format.datetime; diff --git common/src/test/org/apache/hive/common/util/TestTimestampParser.java common/src/test/org/apache/hive/common/util/TestTimestampParser.java index 00a7904ecf..5bf1119cef 100644 --- common/src/test/org/apache/hive/common/util/TestTimestampParser.java +++ common/src/test/org/apache/hive/common/util/TestTimestampParser.java @@ -116,8 +116,7 @@ public void testPattern1() { }; String[] invalidCases = { - "1945-12-31-23:59:59", - "12345", + "12345" }; testValidCases(tp, validCases); @@ -147,8 +146,7 @@ public void testMillisParser() { }; String[] invalidCases = { - "1945-12-31-23:59:59", - "1420509274123-", + "1420509274123-" }; testValidCases(tp, validCases); diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java index d08b05fb68..c09db9af65 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java @@ -528,6 +528,7 @@ system.registerGenericUDF("to_epoch_milli", GenericUDFEpochMilli.class); system.registerGenericUDF("bucket_number", GenericUDFBucketNumber.class); system.registerGenericUDF("tumbling_window", GenericUDFTumbledWindow.class); + system.registerGenericUDF("cast_format", GenericUDFCastFormat.class); // Generic UDTF's system.registerGenericUDTF("explode", GenericUDTFExplode.class); diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java index fa9d1e9783..465464167f 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java @@ -42,8 +42,11 @@ import org.apache.hadoop.hive.ql.exec.vector.expressions.CastBooleanToVarCharViaLongToVarChar; import org.apache.hadoop.hive.ql.exec.vector.expressions.CastCharToBinary; import org.apache.hadoop.hive.ql.exec.vector.expressions.CastDateToChar; +import org.apache.hadoop.hive.ql.exec.vector.expressions.CastDateToCharWithFormat; import org.apache.hadoop.hive.ql.exec.vector.expressions.CastDateToString; +import org.apache.hadoop.hive.ql.exec.vector.expressions.CastDateToStringWithFormat; import org.apache.hadoop.hive.ql.exec.vector.expressions.CastDateToVarChar; +import org.apache.hadoop.hive.ql.exec.vector.expressions.CastDateToVarCharWithFormat; import org.apache.hadoop.hive.ql.exec.vector.expressions.CastDecimalToChar; import org.apache.hadoop.hive.ql.exec.vector.expressions.CastDecimalToDecimal; import org.apache.hadoop.hive.ql.exec.vector.expressions.CastDecimalToString; @@ -65,12 +68,17 @@ import org.apache.hadoop.hive.ql.exec.vector.expressions.CastStringGroupToChar; import org.apache.hadoop.hive.ql.exec.vector.expressions.CastStringGroupToVarChar; import org.apache.hadoop.hive.ql.exec.vector.expressions.CastStringToBoolean; +import org.apache.hadoop.hive.ql.exec.vector.expressions.CastStringToDateWithFormat; import org.apache.hadoop.hive.ql.exec.vector.expressions.CastStringToDecimal; +import org.apache.hadoop.hive.ql.exec.vector.expressions.CastStringToTimestampWithFormat; import org.apache.hadoop.hive.ql.exec.vector.expressions.CastTimestampToChar; +import org.apache.hadoop.hive.ql.exec.vector.expressions.CastTimestampToCharWithFormat; import org.apache.hadoop.hive.ql.exec.vector.expressions.CastTimestampToDecimal; import org.apache.hadoop.hive.ql.exec.vector.expressions.CastTimestampToDouble; 
import org.apache.hadoop.hive.ql.exec.vector.expressions.CastTimestampToString; +import org.apache.hadoop.hive.ql.exec.vector.expressions.CastTimestampToStringWithFormat; import org.apache.hadoop.hive.ql.exec.vector.expressions.CastTimestampToVarChar; +import org.apache.hadoop.hive.ql.exec.vector.expressions.CastTimestampToVarCharWithFormat; import org.apache.hadoop.hive.ql.exec.vector.expressions.ConstantVectorExpression; import org.apache.hadoop.hive.ql.exec.vector.expressions.ConvertDecimal64ToDecimal; import org.apache.hadoop.hive.ql.exec.vector.expressions.Decimal64ColumnInList; @@ -155,6 +163,7 @@ import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.TimestampColumnNotBetween; import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.VarCharColumnBetween; import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.VarCharColumnNotBetween; +import org.apache.hadoop.hive.serde.serdeConstants; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.hive.common.type.DataTypePhysicalVariation; @@ -2290,6 +2299,8 @@ private VectorExpression getGenericUdfVectorExpression(GenericUDF udf, ve = new BucketNumExpression(outCol); ve.setInputTypeInfos(returnType); ve.setOutputTypeInfo(returnType); + } else if (udf instanceof GenericUDFCastFormat) { + ve = getCastWithFormat(udf, childExpr, returnType); } if (ve != null) { return ve; @@ -3300,6 +3311,54 @@ private VectorExpression getCastToLongExpression(List childExpr, P return null; } + private VectorExpression getCastWithFormat( + GenericUDF udf, List childExpr, TypeInfo returnType) throws HiveException { + String inputType = childExpr.get(1).getTypeString(); + childExpr.remove(0); // index 0 not needed since we know returnType + + Class veClass = getCastFormatVectorExpressionClass(childExpr, returnType, inputType); + return createVectorExpression( + veClass, childExpr, VectorExpressionDescriptor.Mode.PROJECTION, returnType); + } + + private Class getCastFormatVectorExpressionClass(List childExpr, + TypeInfo returnType, String inputType) throws HiveException { + switch (inputType) { + case serdeConstants.TIMESTAMP_TYPE_NAME: + if (returnType.getTypeName().equals(serdeConstants.STRING_TYPE_NAME)) { + return CastTimestampToStringWithFormat.class; + } + if (returnType.getTypeName().startsWith(serdeConstants.VARCHAR_TYPE_NAME)) { + return CastTimestampToVarCharWithFormat.class; + } + if (returnType.getTypeName().startsWith(serdeConstants.CHAR_TYPE_NAME)) { + return CastTimestampToCharWithFormat.class; + } + case serdeConstants.DATE_TYPE_NAME: + if (returnType.getTypeName().equals(serdeConstants.STRING_TYPE_NAME)) { + return CastDateToStringWithFormat.class; + } + if (returnType.getTypeName().startsWith(serdeConstants.VARCHAR_TYPE_NAME)) { + return CastDateToVarCharWithFormat.class; + } + if (returnType.getTypeName().startsWith(serdeConstants.CHAR_TYPE_NAME)) { + return CastDateToCharWithFormat.class; + } + } + if (inputType.equals(serdeConstants.STRING_TYPE_NAME) + || inputType.startsWith(serdeConstants.CHAR_TYPE_NAME) + || inputType.startsWith(serdeConstants.VARCHAR_TYPE_NAME)) { + switch (returnType.getTypeName()) { + case serdeConstants.TIMESTAMP_TYPE_NAME: + return CastStringToTimestampWithFormat.class; + case serdeConstants.DATE_TYPE_NAME: + return CastStringToDateWithFormat.class; + } + } + throw new HiveException( + "Expression cast " + inputType + " to " + returnType + " format not" + " vectorizable"); + } + private VectorExpression tryDecimal64Between(VectorExpressionDescriptor.Mode mode, boolean isNot, 
ExprNodeDesc colExpr, List childrenAfterNot, TypeInfo returnTypeInfo) throws HiveException { diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastDateToCharWithFormat.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastDateToCharWithFormat.java new file mode 100644 index 0000000000..3093dd7ecf --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastDateToCharWithFormat.java @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.expressions; + +import org.apache.hadoop.hive.common.format.datetime.HiveDateTimeFormatter; +import org.apache.hadoop.hive.common.format.datetime.HiveSqlDateTimeFormatter; +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; + +import java.nio.charset.StandardCharsets; + +/** + * Vectorized UDF for CAST ( TO CHAR() WITH FORMAT ). + */ +public class CastDateToCharWithFormat extends CastDateToChar { + + private static final long serialVersionUID = 1L; + private HiveDateTimeFormatter formatter; + + public CastDateToCharWithFormat() { + super(); + } + + public CastDateToCharWithFormat(int inputColumn, byte[] patternBytes, int len, int outputColumnNum) { + super(inputColumn, outputColumnNum); + + if (patternBytes == null) { + throw new IllegalStateException("Tried to cast ( to char with format )," + + " but not found"); + } + formatter = + new HiveSqlDateTimeFormatter(new String(patternBytes, StandardCharsets.UTF_8), false); + } + + @Override + protected void func(BytesColumnVector outV, long[] vector, int i) { + super.func(outV, vector, i, formatter); + } + + @Override + public String vectorExpressionParameters() { + return super.vectorExpressionParameters() + ", format pattern: " + formatter.getPattern(); + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastDateToString.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastDateToString.java index dfa9f8a00d..d206bbb00a 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastDateToString.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastDateToString.java @@ -18,28 +18,31 @@ package org.apache.hadoop.hive.ql.exec.vector.expressions; +import org.apache.hadoop.hive.common.format.datetime.HiveDateTimeFormatter; +import org.apache.hadoop.hive.common.format.datetime.HiveSimpleDateFormatter; +import org.apache.hadoop.hive.common.type.Timestamp; import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; -import org.apache.hadoop.hive.serde2.io.DateWritableV2; import java.sql.Date; -import java.text.SimpleDateFormat; import java.util.TimeZone; public class CastDateToString extends LongToStringUnaryUDF { private static final long 
serialVersionUID = 1L; protected transient Date dt = new Date(0); - private transient SimpleDateFormat formatter; + private transient HiveDateTimeFormatter formatter; public CastDateToString() { super(); - formatter = new SimpleDateFormat("yyyy-MM-dd"); - formatter.setTimeZone(TimeZone.getTimeZone("UTC")); + initFormatter(); } public CastDateToString(int inputColumn, int outputColumnNum) { super(inputColumn, outputColumnNum); - formatter = new SimpleDateFormat("yyyy-MM-dd"); - formatter.setTimeZone(TimeZone.getTimeZone("UTC")); + initFormatter(); + } + + public void initFormatter() { + formatter = new HiveSimpleDateFormatter("yyyy-MM-dd", TimeZone.getTimeZone("UTC")); } // The assign method will be overridden for CHAR and VARCHAR. @@ -47,10 +50,23 @@ protected void assign(BytesColumnVector outV, int i, byte[] bytes, int length) { outV.setVal(i, bytes, 0, length); } + private void assignNull(BytesColumnVector outV, int i) { + outV.isNull[i] = true; + outV.noNulls = false; + } + @Override protected void func(BytesColumnVector outV, long[] vector, int i) { - dt.setTime(DateWritableV2.daysToMillis((int) vector[i])); - byte[] temp = formatter.format(dt).getBytes(); - assign(outV, i, temp, temp.length); + func(outV, vector, i, formatter); + } + + protected void func(BytesColumnVector outV, long[] vector, int i, HiveDateTimeFormatter formatter) { + try { + byte[] temp = formatter.format( + org.apache.hadoop.hive.common.type.Date.ofEpochDay((int) vector[i])).getBytes(); + assign(outV, i, temp, temp.length); + } catch (Exception e) { + assignNull(outV, i); + } } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastDateToStringWithFormat.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastDateToStringWithFormat.java new file mode 100644 index 0000000000..e0321159df --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastDateToStringWithFormat.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.expressions; + +import org.apache.hadoop.hive.common.format.datetime.HiveDateTimeFormatter; +import org.apache.hadoop.hive.common.format.datetime.HiveSqlDateTimeFormatter; +import org.apache.hadoop.hive.common.type.Date; +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorExpressionDescriptor; + +import java.nio.charset.StandardCharsets; + +/** + * Vectorized UDF for CAST ( TO STRING WITH FORMAT ). 
+ */ +public class CastDateToStringWithFormat extends CastDateToString { + private static final long serialVersionUID = 1L; + protected transient Date dt; + private HiveDateTimeFormatter formatter; + + public CastDateToStringWithFormat() { + super(); + } + + public CastDateToStringWithFormat(int inputColumn, byte[] patternBytes, int outputColumnNum) { + super(inputColumn, outputColumnNum); + + if (patternBytes == null) { + throw new IllegalStateException("Tried to cast ( to string with format )," + + " but not found"); + } + formatter = + new HiveSqlDateTimeFormatter(new String(patternBytes, StandardCharsets.UTF_8), false); + } + + // The assign method will be overridden for CHAR and VARCHAR. + protected void assign(BytesColumnVector outV, int i, byte[] bytes, int length) { + outV.setVal(i, bytes, 0, length); + } + + @Override + protected void func(BytesColumnVector outV, long[] vector, int i) { + super.func(outV, vector, i, formatter); + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastDateToVarCharWithFormat.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastDateToVarCharWithFormat.java new file mode 100644 index 0000000000..c84f223b82 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastDateToVarCharWithFormat.java @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.expressions; + +import org.apache.hadoop.hive.common.format.datetime.HiveDateTimeFormatter; +import org.apache.hadoop.hive.common.format.datetime.HiveSqlDateTimeFormatter; +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; + +import java.nio.charset.StandardCharsets; + +/** + * Vectorized UDF for CAST ( TO VARCHAR() WITH FORMAT ). 
+ */ +public class CastDateToVarCharWithFormat extends CastDateToVarChar { + + private static final long serialVersionUID = 1L; + private HiveDateTimeFormatter formatter; + + public CastDateToVarCharWithFormat() { + super(); + } + + public CastDateToVarCharWithFormat(int inputColumn, byte[] patternBytes, int len, int outputColumnNum) { + super(inputColumn, outputColumnNum); + + if (patternBytes == null) { + throw new IllegalStateException("Tried to cast ( to varchar with format )," + + " but not found"); + } + formatter = + new HiveSqlDateTimeFormatter(new String(patternBytes, StandardCharsets.UTF_8), false); + } + + @Override + protected void func(BytesColumnVector outV, long[] vector, int i) { + super.func(outV, vector, i, formatter); + } + + @Override + public String vectorExpressionParameters() { + return super.vectorExpressionParameters() + ", format pattern: " + formatter.getPattern(); + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastStringToDate.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastStringToDate.java index a6dff12e1a..44a451b3bc 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastStringToDate.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastStringToDate.java @@ -18,6 +18,7 @@ package org.apache.hadoop.hive.ql.exec.vector.expressions; +import org.apache.hadoop.hive.common.format.datetime.HiveDateTimeFormatter; import org.apache.hadoop.hive.common.type.Date; import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; @@ -151,10 +152,21 @@ public void evaluate(VectorizedRowBatch batch) throws HiveException { } } - private void evaluate(LongColumnVector outputColVector, BytesColumnVector inV, int i) { + /** + * Used by CastStringToDate. + */ + protected void evaluate(LongColumnVector outputColVector, BytesColumnVector inV, int i) { + evaluate(outputColVector, inV, i, null); + } + + /** + * Used by CastStringToDateWithFormat. + */ + protected void evaluate(LongColumnVector outputColVector, BytesColumnVector inV, int i, + HiveDateTimeFormatter formatter) { String dateString = new String(inV.vector[i], inV.start[i], inV.length[i], StandardCharsets.UTF_8); Date hDate = new Date(); - if (dateParser.parseDate(dateString, hDate)) { + if (dateParser.parseDate(dateString, hDate, formatter)) { outputColVector.vector[i] = DateWritableV2.dateToDays(hDate); return; } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastStringToDateWithFormat.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastStringToDateWithFormat.java new file mode 100644 index 0000000000..e242ad5ea2 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastStringToDateWithFormat.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.expressions; + +import org.apache.hadoop.hive.common.format.datetime.HiveDateTimeFormatter; +import org.apache.hadoop.hive.common.format.datetime.HiveSqlDateTimeFormatter; +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorExpressionDescriptor; + +import java.nio.charset.StandardCharsets; + +/** + * Vectorized UDF for CAST ( TO DATE WITH FORMAT ). + */ +public class CastStringToDateWithFormat extends CastStringToDate { + + private static final long serialVersionUID = 1L; + private HiveDateTimeFormatter formatter; + + public CastStringToDateWithFormat() { + super(); + } + + public CastStringToDateWithFormat(int inputColumn, byte[] patternBytes, int outputColumnNum) { + super(inputColumn, outputColumnNum); + + if (patternBytes == null) { + throw new IllegalStateException("Tried to cast ( to date with format )," + + " but not found"); + } + formatter = + new HiveSqlDateTimeFormatter(new String(patternBytes, StandardCharsets.UTF_8), true); + } + + @Override + protected void evaluate(LongColumnVector outputColVector, + BytesColumnVector inputColVector, int i) { + super.evaluate(outputColVector, inputColVector, i, formatter); + } + + @Override + public VectorExpressionDescriptor.Descriptor getDescriptor() { + VectorExpressionDescriptor.Builder b = new VectorExpressionDescriptor.Builder(); + b.setMode(VectorExpressionDescriptor.Mode.PROJECTION) + .setNumArguments(2) + .setArgumentTypes( + VectorExpressionDescriptor.ArgumentType.STRING_FAMILY, + VectorExpressionDescriptor.ArgumentType.STRING) + .setInputExpressionTypes( + VectorExpressionDescriptor.InputExpressionType.COLUMN, + VectorExpressionDescriptor.InputExpressionType.SCALAR); + return b.build(); + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastStringToTimestamp.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastStringToTimestamp.java index b48b0136eb..f8d81cdb13 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastStringToTimestamp.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastStringToTimestamp.java @@ -19,8 +19,9 @@ package org.apache.hadoop.hive.ql.exec.vector.expressions; import java.util.Arrays; -import java.sql.Timestamp; +import org.apache.hadoop.hive.common.format.datetime.HiveDateTimeFormatter; +import org.apache.hadoop.hive.common.type.Timestamp; import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector; import org.apache.hadoop.hive.ql.exec.vector.VectorExpressionDescriptor; @@ -143,21 +144,40 @@ public void evaluate(VectorizedRowBatch batch) throws HiveException { } } - private void evaluate(TimestampColumnVector outputColVector, BytesColumnVector inputColVector, int i) { + /** + * This is used by CastStringToTimestamp. 
+ */ + protected void evaluate(TimestampColumnVector outputColVector, BytesColumnVector inputColVector, int i) { + evaluate(outputColVector, inputColVector, i, null); + } + + /** + * This is used by CastStringToTimestampWithFormat. + */ + protected void evaluate(TimestampColumnVector outputColVector, BytesColumnVector inputColVector, + int i, HiveDateTimeFormatter formatter) { try { - org.apache.hadoop.hive.common.type.Timestamp timestamp = - PrimitiveObjectInspectorUtils.getTimestampFromString( + Timestamp timestamp = PrimitiveObjectInspectorUtils.getTimestampFromString( new String( inputColVector.vector[i], inputColVector.start[i], inputColVector.length[i], - "UTF-8")); - outputColVector.set(i, timestamp.toSqlTimestamp()); + "UTF-8"), + formatter); + if (timestamp != null) { + outputColVector.set(i, timestamp.toSqlTimestamp()); + } else { + setNullValue(outputColVector, i); + } } catch (Exception e) { - outputColVector.setNullValue(i); - outputColVector.isNull[i] = true; - outputColVector.noNulls = false; + setNullValue(outputColVector, i); } } + private void setNullValue(TimestampColumnVector outputColVector, int i) { + outputColVector.setNullValue(i); + outputColVector.isNull[i] = true; + outputColVector.noNulls = false; + } + @Override public String vectorExpressionParameters() { return getColumnParamString(0, inputColumn); diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastStringToTimestampWithFormat.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastStringToTimestampWithFormat.java new file mode 100644 index 0000000000..eeeaa5d935 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastStringToTimestampWithFormat.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.expressions; + +import org.apache.hadoop.hive.common.format.datetime.HiveDateTimeFormatter; +import org.apache.hadoop.hive.common.format.datetime.HiveSqlDateTimeFormatter; +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorExpressionDescriptor; + +import java.nio.charset.StandardCharsets; + +/** + * Vectorized UDF for CAST ( TO TIMESTAMP WITH FORMAT ). 
+ */ +public class CastStringToTimestampWithFormat extends CastStringToTimestamp { + + private static final long serialVersionUID = 1L; + private HiveDateTimeFormatter formatter; + + public CastStringToTimestampWithFormat() { + super(); + } + + public CastStringToTimestampWithFormat(int inputColumn, byte[] patternBytes, int outputColumnNum) { + super(inputColumn, outputColumnNum); + + if (patternBytes == null) { + throw new IllegalStateException("Tried to cast ( to timestamp with format" + + "), but not found"); + } + formatter = + new HiveSqlDateTimeFormatter(new String(patternBytes, StandardCharsets.UTF_8), true); + } + + @Override + protected void evaluate(TimestampColumnVector outputColVector, + BytesColumnVector inputColVector, int i) { + super.evaluate(outputColVector, inputColVector, i, formatter); + } + + @Override + public VectorExpressionDescriptor.Descriptor getDescriptor() { + VectorExpressionDescriptor.Builder b = new VectorExpressionDescriptor.Builder(); + b.setMode(VectorExpressionDescriptor.Mode.PROJECTION) + .setNumArguments(2) + .setArgumentTypes( + VectorExpressionDescriptor.ArgumentType.STRING_FAMILY, + VectorExpressionDescriptor.ArgumentType.STRING) + .setInputExpressionTypes( + VectorExpressionDescriptor.InputExpressionType.COLUMN, + VectorExpressionDescriptor.InputExpressionType.SCALAR); + return b.build(); + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastTimestampToCharWithFormat.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastTimestampToCharWithFormat.java new file mode 100644 index 0000000000..5334200eba --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastTimestampToCharWithFormat.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.expressions; + +import org.apache.hadoop.hive.common.format.datetime.HiveDateTimeFormatter; +import org.apache.hadoop.hive.common.format.datetime.HiveSqlDateTimeFormatter; +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector; + +import java.nio.charset.StandardCharsets; + +/** + * Vectorized UDF for CAST ( TO CHAR( WITH FORMAT ). 
+ */ +public class CastTimestampToCharWithFormat extends CastTimestampToChar { + + private static final long serialVersionUID = 1L; + private HiveDateTimeFormatter formatter; + + public CastTimestampToCharWithFormat() { + super(); + } + + public CastTimestampToCharWithFormat(int inputColumn, byte[] patternBytes, int len, int outputColumnNum) { + super(inputColumn, outputColumnNum); + + if (patternBytes == null) { + throw new IllegalStateException("Tried to cast ( to char with format )," + + " but not found"); + } + formatter = + new HiveSqlDateTimeFormatter(new String(patternBytes, StandardCharsets.UTF_8), false); + } + + @Override + protected void func(BytesColumnVector outV, TimestampColumnVector inV, int i) { + super.func(outV, inV, i, formatter); + } + + @Override + public String vectorExpressionParameters() { + return super.vectorExpressionParameters() + ", format pattern: " + formatter.getPattern(); + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastTimestampToString.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastTimestampToString.java index adc3a9d7b9..61da01fda0 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastTimestampToString.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastTimestampToString.java @@ -18,6 +18,8 @@ package org.apache.hadoop.hive.ql.exec.vector.expressions; +import org.apache.hadoop.hive.common.format.datetime.HiveDateTimeFormatter; +import org.apache.hadoop.hive.common.format.datetime.HiveJavaDateTimeFormatter; import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector; @@ -42,12 +44,20 @@ PRINT_FORMATTER = builder.toFormatter(); } + private transient HiveDateTimeFormatter format; + public CastTimestampToString() { super(); + initFormatter(); } public CastTimestampToString(int inputColumn, int outputColumnNum) { super(inputColumn, outputColumnNum); + initFormatter(); + } + + private void initFormatter() { + format = new HiveJavaDateTimeFormatter(PRINT_FORMATTER); } // The assign method will be overridden for CHAR and VARCHAR. 
@@ -55,14 +65,27 @@ protected void assign(BytesColumnVector outV, int i, byte[] bytes, int length) { outV.setVal(i, bytes, 0, length); } + private void assignNull(BytesColumnVector outV, int i) { + outV.isNull[i] = true; + outV.noNulls = false; + } + @Override protected void func(BytesColumnVector outV, TimestampColumnVector inV, int i) { - byte[] temp = LocalDateTime.ofInstant(Instant.ofEpochMilli(inV.time[i]), ZoneOffset.UTC) - .withNano(inV.nanos[i]) - .format(PRINT_FORMATTER).getBytes(); - assign(outV, i, temp, temp.length); + func(outV, inV, i, format); } + protected void func(BytesColumnVector outV, TimestampColumnVector inV, int i, HiveDateTimeFormatter formatter) { + try { + String formattedLocalDateTime = formatter.format( + org.apache.hadoop.hive.common.type.Timestamp.ofEpochMilli(inV.time[i], inV.nanos[i])); + + byte[] temp = formattedLocalDateTime.getBytes(); + assign(outV, i, temp, temp.length); + } catch (Exception e) { + assignNull(outV, i); + } + } public static String getTimestampString(Timestamp ts) { return LocalDateTime.ofInstant(Instant.ofEpochMilli(ts.getTime()), ZoneOffset.UTC) diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastTimestampToStringWithFormat.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastTimestampToStringWithFormat.java new file mode 100644 index 0000000000..774fa6ff50 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastTimestampToStringWithFormat.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.expressions; + +import org.apache.hadoop.hive.common.format.datetime.HiveDateTimeFormatter; +import org.apache.hadoop.hive.common.format.datetime.HiveSqlDateTimeFormatter; +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorExpressionDescriptor; + +import java.nio.charset.StandardCharsets; + +/** + * Vectorized UDF for CAST ( TO STRING WITH FORMAT ). 
+ */ +public class CastTimestampToStringWithFormat extends CastTimestampToString { + private static final long serialVersionUID = 1L; + private HiveDateTimeFormatter formatter; + + public CastTimestampToStringWithFormat() { + super(); + } + + public CastTimestampToStringWithFormat(int inputColumn, byte[] patternBytes, int outputColumnNum) { + super(inputColumn, outputColumnNum); + + if (patternBytes == null) { + throw new IllegalStateException("Tried to cast ( to string with format" + + " ), but not found"); + } + formatter = + new HiveSqlDateTimeFormatter(new String(patternBytes, StandardCharsets.UTF_8), false); + } + + @Override + protected void func(BytesColumnVector outV, TimestampColumnVector inV, int i) { + super.func(outV, inV, i, formatter); + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastTimestampToVarCharWithFormat.java ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastTimestampToVarCharWithFormat.java new file mode 100644 index 0000000000..02c30f642d --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastTimestampToVarCharWithFormat.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.expressions; + +import org.apache.hadoop.hive.common.format.datetime.HiveDateTimeFormatter; +import org.apache.hadoop.hive.common.format.datetime.HiveSqlDateTimeFormatter; +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector; + +import java.nio.charset.StandardCharsets; + +/** + * Vectorized UDF for CAST ( TO VARCHAR( WITH FORMAT ). 
+ */ +public class CastTimestampToVarCharWithFormat extends CastTimestampToVarChar { + + private static final long serialVersionUID = 1L; + private HiveDateTimeFormatter formatter; + + public CastTimestampToVarCharWithFormat() { + super(); + } + + public CastTimestampToVarCharWithFormat(int inputColumn, byte[] patternBytes, int len, int outputColumnNum) { + super(inputColumn, outputColumnNum); + + if (patternBytes == null) { + throw new IllegalStateException("Tried to cast ( to varchar with format" + + "), but not found"); + } + formatter = + new HiveSqlDateTimeFormatter(new String(patternBytes, StandardCharsets.UTF_8), false); + } + + @Override + protected void func(BytesColumnVector outV, TimestampColumnVector inV, int i) { + super.func(outV, inV, i, formatter); + } + + @Override + public String vectorExpressionParameters() { + return super.vectorExpressionParameters() + ", format pattern: " + formatter.getPattern(); + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/IdentifiersParser.g ql/src/java/org/apache/hadoop/hive/ql/parse/IdentifiersParser.g index 58fe0cd32e..013079c3d2 100644 --- ql/src/java/org/apache/hadoop/hive/ql/parse/IdentifiersParser.g +++ ql/src/java/org/apache/hadoop/hive/ql/parse/IdentifiersParser.g @@ -247,8 +247,18 @@ castExpression LPAREN expression KW_AS - primitiveType - RPAREN -> ^(TOK_FUNCTION primitiveType expression) + toType=primitiveType + (fmt=KW_FORMAT StringLiteral)? + RPAREN + // simple cast + -> {$fmt == null}? ^(TOK_FUNCTION $toType expression) + + // plain cast ... format: toType is int representing a TOK_* in HiveParser_IdentifiersParser, expression, format pattern + -> {((CommonTree)toType.getTree()).getChild(0) == null}? + ^(TOK_FUNCTION {adaptor.create(Identifier, "cast_format")} NumberLiteral[Integer.toString(((CommonTree)toType.getTree()).token.getType())] expression StringLiteral) + + // cast ... 
format to type with 4th parameter which is length of CHAR or VARCHAR + -> ^(TOK_FUNCTION {adaptor.create(Identifier, "cast_format")} NumberLiteral[Integer.toString(((CommonTree)toType.getTree()).token.getType())] expression StringLiteral NumberLiteral[((CommonTree)toType.getTree()).getChild(0).getText()]) ; caseExpression diff --git ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFCastFormat.java ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFCastFormat.java new file mode 100644 index 0000000000..da6a26fcbf --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFCastFormat.java @@ -0,0 +1,188 @@ +package org.apache.hadoop.hive.ql.udf.generic; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.ImmutableMap; +import org.apache.hadoop.hive.common.format.datetime.HiveSqlDateTimeFormatter; +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.parse.HiveParser_IdentifiersParser; +import org.apache.hadoop.hive.serde.serdeConstants; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorConverter; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.SettableDateObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.SettableHiveCharObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.SettableHiveVarcharObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.SettableTimestampObjectInspector; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.Serializable; +import java.util.Map; + +/** + * Vector expressions: CastDateToCharWithFormat, CastDateToStringWithFormat, + * CastDateToVarCharWithFormat, CastTimestampToCharWithFormat, + * CastTimestampToStringWithFormat, CastTimestampToVarCharWithFormat + * Could not use @VectorizedExpressions annotation because e.g. CastXToCharWithFormat, + * CastXToStringWithFormat, CastXToVarCharWithFormat would have same description. 
+ */
+@Description(name = "cast_format",
+    value = "CAST(<expr> AS <type> [FORMAT <pattern>]) - Converts a datetime value to a string or"
+        + " a string-type value to a datetime based on the format pattern specified.",
+    extended = "If a format is specified with the FORMAT argument then SQL:2016 datetime formats"
+        + " will be used.\n"
+        + "Example:\n "
+        + " > SELECT CAST(\"2018-01-01 4 PM\" AS timestamp FORMAT \"yyyy-mm-dd hh12 AM\");\n"
+        + " 2018-01-01 16:00:00")
+public class GenericUDFCastFormat extends GenericUDF implements Serializable {
+
+  private static final Logger LOG = LoggerFactory.getLogger(GenericUDFCastFormat.class.getName());
+
+  @VisibleForTesting
+  protected static final Map<Integer, String> OUTPUT_TYPES = ImmutableMap.<Integer, String>builder()
+      .put(HiveParser_IdentifiersParser.TOK_STRING, serdeConstants.STRING_TYPE_NAME)
+      .put(HiveParser_IdentifiersParser.TOK_VARCHAR, serdeConstants.VARCHAR_TYPE_NAME)
+      .put(HiveParser_IdentifiersParser.TOK_CHAR, serdeConstants.CHAR_TYPE_NAME)
+      .put(HiveParser_IdentifiersParser.TOK_TIMESTAMP, serdeConstants.TIMESTAMP_TYPE_NAME)
+      .put(HiveParser_IdentifiersParser.TOK_DATE, serdeConstants.DATE_TYPE_NAME).build();
+
+  private transient ObjectInspectorConverters.ConverterWithFormatOption converter;
+
+  public GenericUDFCastFormat() {
+  }
+
+  /**
+   * @param arguments
+   *     0. const int, value of a HiveParser_IdentifiersParser constant which represents a TOK_[TYPE]
+   *     1. expression to convert
+   *     2. constant string, format pattern
+   *     3. (optional) constant int, output char/varchar length
+   */
+  @Override public ObjectInspector initialize(ObjectInspector[] arguments)
+      throws UDFArgumentException {
+    if (arguments.length != 3 && arguments.length != 4) {
+      throw new UDFArgumentException(
+          "Function cast_format requires 3 or 4 arguments (int, expression, StringLiteral"
+              + "[, var/char length]), got " + arguments.length);
+    }
+
+    PrimitiveObjectInspector outputOI = getOutputOI(arguments);
+    PrimitiveObjectInspector inputOI;
+    try {
+      inputOI = (PrimitiveObjectInspector) arguments[1];
+    } catch (ClassCastException e) {
+      throw new UDFArgumentException(
+          "Function CAST...as ... FORMAT ... takes only primitive types");
+    }
+    PrimitiveObjectInspectorUtils.PrimitiveGrouping inputPG =
+        PrimitiveObjectInspectorUtils.getPrimitiveGrouping(inputOI.getPrimitiveCategory());
+    PrimitiveObjectInspectorUtils.PrimitiveGrouping outputPG =
+        PrimitiveObjectInspectorUtils.getPrimitiveGrouping(outputOI.getPrimitiveCategory());
+
+    if (inputOI.getPrimitiveCategory()
+        == PrimitiveObjectInspector.PrimitiveCategory.TIMESTAMPLOCALTZ) {
+      throw new UDFArgumentException(
+          "Timestamp with local time zone not yet supported for cast ... format function");
+    }
+    if (!(inputPG == PrimitiveObjectInspectorUtils.PrimitiveGrouping.STRING_GROUP
+        && outputPG == PrimitiveObjectInspectorUtils.PrimitiveGrouping.DATE_GROUP
+        || inputPG == PrimitiveObjectInspectorUtils.PrimitiveGrouping.DATE_GROUP
+        && outputPG == PrimitiveObjectInspectorUtils.PrimitiveGrouping.STRING_GROUP
+        || inputPG == PrimitiveObjectInspectorUtils.PrimitiveGrouping.VOID_GROUP)) {
+      throw new UDFArgumentException(
+          "Function CAST...as ... FORMAT ... only converts datetime objects to string types"
+              + " and string or void objects to datetime types. Output type provided: "
+              + outputOI.getPrimitiveCategory() + " in primitive grouping " + outputPG
+              + ", input type provided: " + inputOI.getPrimitiveCategory()
+              + " in primitive grouping " + inputPG);
+    }
+
+    boolean forParsing = (outputPG == PrimitiveObjectInspectorUtils.PrimitiveGrouping.DATE_GROUP);
+    converter = getConverter(inputOI, outputOI);
+    if (converter == null) {
+      throw new UDFArgumentException("Function CAST...as ... FORMAT ... couldn't create"
+          + " converter from inputOI " + inputOI + " and outputOI " + outputOI);
+    }
+    converter.setDateTimeFormatter(
+        new HiveSqlDateTimeFormatter(getConstantStringValue(arguments, 2), forParsing));
+
+    return outputOI;
+  }
+
+  private PrimitiveObjectInspector getOutputOI(ObjectInspector[] arguments)
+      throws UDFArgumentException {
+    int key = getConstantIntValue(arguments, 0);
+    if (!OUTPUT_TYPES.containsKey(key)) {
+      throw new UDFArgumentException("Cast...format can only convert to DATE, TIMESTAMP, STRING,"
+          + " VARCHAR, CHAR. Can't convert to HiveParser_IdentifiersParser constant with value "
+          + key);
+    }
+    String typeString = OUTPUT_TYPES.get(key);
+    if (serdeConstants.VARCHAR_TYPE_NAME.equals(typeString)
+        || serdeConstants.CHAR_TYPE_NAME.equals(typeString)) {
+      if (arguments.length < 4 || arguments[3] == null) {
+        throw new UDFArgumentException(typeString + " missing length argument");
+      }
+      typeString += "(" + getConstantIntValue(arguments, 3) + ")";
+    }
+    PrimitiveTypeInfo typeInfo = TypeInfoFactory.getPrimitiveTypeInfo(typeString);
+    return PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(typeInfo);
+  }
+
+  private ObjectInspectorConverters.ConverterWithFormatOption getConverter(
+      PrimitiveObjectInspector inputOI, PrimitiveObjectInspector outputOI) {
+    switch (outputOI.getPrimitiveCategory()) {
+    case STRING:
+      return new PrimitiveObjectInspectorConverter.TextConverter(inputOI);
+    case CHAR:
+      return new PrimitiveObjectInspectorConverter.HiveCharConverter(inputOI,
+          (SettableHiveCharObjectInspector) outputOI);
+    case VARCHAR:
+      return new PrimitiveObjectInspectorConverter.HiveVarcharConverter(inputOI,
+          (SettableHiveVarcharObjectInspector) outputOI);
+    case TIMESTAMP:
+      return new PrimitiveObjectInspectorConverter.TimestampConverter(inputOI,
+          (SettableTimestampObjectInspector) outputOI);
+    case DATE:
+      return new PrimitiveObjectInspectorConverter.DateConverter(inputOI,
+          (SettableDateObjectInspector) outputOI);
+    }
+    return null;
+  }
+
+  @Override public Object evaluate(DeferredObject[] arguments) throws HiveException {
+    Object o0 = arguments[1].get();
+    if (o0 == null) {
+      return null;
+    }
+    return converter.convert(o0);
+  }
+
+  @Override public String getDisplayString(String[] children) {
+    assert children.length == 3 || children.length == 4;
+    StringBuilder sb = new StringBuilder();
+    sb.append("CAST( ");
+    sb.append(children[1]);
+    sb.append(" AS ");
+    int typeKey = Integer.parseInt(children[0]);
+    if (!OUTPUT_TYPES.containsKey(typeKey)) {
+      sb.append("HiveParser_IdentifiersParser index ").append(typeKey);
+    } else {
+      sb.append(OUTPUT_TYPES.get(typeKey));
+      if (children.length == 4) {
+        sb.append("(").append(children[3]).append(")");
+      }
+    }
+    sb.append(" FORMAT ");
+    sb.append(children[2]);
+    sb.append(" )");
+    return sb.toString();
+  }
+}
\ No newline at end of file
diff --git ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFDateSub.java ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFDateSub.java
index bcc4114099..6c3c3349bb 100644
---
ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFDateSub.java +++ ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFDateSub.java @@ -17,8 +17,6 @@ */ package org.apache.hadoop.hive.ql.udf.generic; -import java.text.SimpleDateFormat; - import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.vector.VectorizedExpressions; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorUDFDateSubColCol; @@ -46,7 +44,6 @@ + " '2009-07-29'") @VectorizedExpressions({VectorUDFDateSubColScalar.class, VectorUDFDateSubScalarCol.class, VectorUDFDateSubColCol.class}) public class GenericUDFDateSub extends GenericUDFDateAdd { - private transient SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd"); public GenericUDFDateSub() { this.signModifier = -1; diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorMathFunctions.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorMathFunctions.java index 663237739e..092d320ecd 100644 --- ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorMathFunctions.java +++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorMathFunctions.java @@ -22,6 +22,7 @@ import java.util.Arrays; import java.util.Random; +import org.apache.hadoop.hive.common.type.Date; import org.junit.Assert; import org.apache.hadoop.hive.serde2.RandomTypeUtil; @@ -115,18 +116,20 @@ public void testRoundToDecimalPlaces() throws HiveException { Assert.assertEquals(1.2346d, resultV.vector[7], Double.MIN_VALUE); } - static int DAYS_LIMIT = 365 * 9999; + private static final int DAYS_LIMIT = 365 * 9999; + //approximate, so we get some negative values: + private static final int SMALLEST_EPOCH_DAY = -365 * 1969; public static VectorizedRowBatch getVectorizedRowBatchDateInTimestampOut(int[] intValues) { Random r = new Random(12099); VectorizedRowBatch batch = new VectorizedRowBatch(2); LongColumnVector inV; TimestampColumnVector outV; - inV = new LongColumnVector(); - outV = new TimestampColumnVector(); + inV = new LongColumnVector(intValues.length); + outV = new TimestampColumnVector(intValues.length); for (int i = 0; i < intValues.length; i++) { - intValues[i] = r.nextInt() % DAYS_LIMIT; + intValues[i] = SMALLEST_EPOCH_DAY + r.nextInt() % DAYS_LIMIT; inV.vector[i] = intValues[i]; } @@ -137,6 +140,36 @@ public static VectorizedRowBatch getVectorizedRowBatchDateInTimestampOut(int[] i return batch; } + public static VectorizedRowBatch getVectorizedRowBatchDateInStringOut(int[] intValues) { + // get date in timestamp out, and change timestamp out to string out + VectorizedRowBatch batch = getVectorizedRowBatchDateInTimestampOut(intValues); + BytesColumnVector outV = new BytesColumnVector(intValues.length); + batch.cols[1] = outV; + return batch; + } + + // For testing CastDateToStringWithFormat with + // TestVectorTypeCastsWithFormat#testCastDateToStringWithFormat + public static VectorizedRowBatch getVectorizedRowBatchDateInStringOutFormatted() { + VectorizedRowBatch batch = new VectorizedRowBatch(2); + LongColumnVector dateColumnV; + BytesColumnVector stringColumnV; + dateColumnV = new LongColumnVector(); + stringColumnV = new BytesColumnVector(); + + dateColumnV.vector[0] = Date.valueOf("2019-12-31").toEpochDay(); + dateColumnV.vector[1] = Date.valueOf("1776-07-04").toEpochDay(); + dateColumnV.vector[2] = Date.valueOf("2012-02-29").toEpochDay(); + dateColumnV.vector[3] = Date.valueOf("1580-08-08").toEpochDay(); + dateColumnV.vector[4] = 
Date.valueOf("0005-01-01").toEpochDay(); + dateColumnV.vector[5] = Date.valueOf("9999-12-31").toEpochDay(); + + batch.cols[0] = dateColumnV; + batch.cols[1] = stringColumnV; + batch.size = 6; + return batch; + } + public static VectorizedRowBatch getVectorizedRowBatchDoubleInLongOut() { VectorizedRowBatch batch = new VectorizedRowBatch(2); LongColumnVector lcv; @@ -277,6 +310,42 @@ public static VectorizedRowBatch getVectorizedRowBatchStringInLongOut() { return batch; } + public static VectorizedRowBatch getVectorizedRowBatchStringInTimestampOutFormatted() { + VectorizedRowBatch batch = new VectorizedRowBatch(2); + BytesColumnVector inV; + inV = new BytesColumnVector(); + inV.initBuffer(); + inV.setVal(0, StandardCharsets.UTF_8.encode("2019-12-31 00:00:00.999999999").array()); + inV.setVal(1, StandardCharsets.UTF_8.encode("1776-07-04 17:07:06.177617761").array()); + inV.setVal(2, StandardCharsets.UTF_8.encode("2012-02-29 23:59:59.999999999").array()); + inV.setVal(3, StandardCharsets.UTF_8.encode("1580-08-08 00:00:00.0").array()); + inV.setVal(4, StandardCharsets.UTF_8.encode("0005-01-01 00:00:00.0").array()); + inV.setVal(5, StandardCharsets.UTF_8.encode("9999-12-31 23:59:59.999999999").array()); + + batch.cols[0] = inV; + + batch.size = 6; + return batch; + } + + public static VectorizedRowBatch getVectorizedRowBatchStringInDateOutFormatted() { + VectorizedRowBatch batch = new VectorizedRowBatch(2); + BytesColumnVector inV; + inV = new BytesColumnVector(); + inV.initBuffer(); + inV.setVal(0, StandardCharsets.UTF_8.encode("19/12/31").array()); + inV.setVal(1, StandardCharsets.UTF_8.encode("1776--07--04").array()); + inV.setVal(2, StandardCharsets.UTF_8.encode("2012/02/29").array()); + inV.setVal(3, StandardCharsets.UTF_8.encode("1580/08/08").array()); + inV.setVal(4, StandardCharsets.UTF_8.encode("0005/01/01").array()); + inV.setVal(5, StandardCharsets.UTF_8.encode("9999/12/31").array()); + + batch.cols[0] = inV; + + batch.size = 6; + return batch; + } + public static VectorizedRowBatch getVectorizedRowBatchTimestampInLongOut(long[] longValues) { Random r = new Random(345); VectorizedRowBatch batch = new VectorizedRowBatch(2); @@ -297,6 +366,55 @@ public static VectorizedRowBatch getVectorizedRowBatchTimestampInLongOut(long[] return batch; } + + public static VectorizedRowBatch getVectorizedRowBatchTimestampInStringOut( + long[] epochSecondValues, int[] nanoValues) { + Random r = new Random(345); + VectorizedRowBatch batch = new VectorizedRowBatch(2); + batch.size = epochSecondValues.length; + + TimestampColumnVector inV; + BytesColumnVector outV; + inV = new TimestampColumnVector(batch.size); + outV = new BytesColumnVector(batch.size); + + for (int i = 0; i < batch.size; i++) { + Timestamp randTimestamp = RandomTypeUtil.getRandTimestamp(r); + epochSecondValues[i] = randTimestamp.toEpochSecond(); + nanoValues[i] = randTimestamp.getNanos(); + inV.set(i, randTimestamp.toSqlTimestamp()); + } + + batch.cols[0] = inV; + batch.cols[1] = outV; + + return batch; + } + + public static VectorizedRowBatch getVectorizedRowBatchTimestampInStringOutFormatted() { + VectorizedRowBatch batch = new VectorizedRowBatch(2); + TimestampColumnVector timestampColumnV; + BytesColumnVector stringColumnV; + timestampColumnV = new TimestampColumnVector(); + stringColumnV = new BytesColumnVector(); + + timestampColumnV.set(0, getSqlTimestamp("2019-12-31 19:20:21.999999999")); + timestampColumnV.set(1, getSqlTimestamp("1776-07-04 17:07:06.177617761")); + timestampColumnV.set(2, getSqlTimestamp("2012-02-29 
23:59:59.999999999")); + timestampColumnV.set(3, getSqlTimestamp("1580-08-08 00:00:00")); + timestampColumnV.set(4, getSqlTimestamp("0005-01-01 00:00:00")); + timestampColumnV.set(5, getSqlTimestamp("9999-12-31 23:59:59.999999999")); + + batch.cols[0] = timestampColumnV; + batch.cols[1] = stringColumnV; + batch.size = 6; + return batch; + } + + private static java.sql.Timestamp getSqlTimestamp(String s) { + return Timestamp.valueOf(s).toSqlTimestamp(); + } + static long SECONDS_LIMIT = 60L * 24L * 365L * 9999L; public static VectorizedRowBatch getVectorizedRowBatchLongInTimestampOut(long[] longValues) { diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorTypeCasts.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorTypeCasts.java index 58fd7b030e..a449ea143d 100644 --- ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorTypeCasts.java +++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorTypeCasts.java @@ -23,7 +23,9 @@ import static org.junit.Assert.assertTrue; import java.math.BigDecimal; +import java.nio.charset.StandardCharsets; import java.sql.Timestamp; +import java.util.Arrays; import java.util.Random; import java.util.concurrent.TimeUnit; @@ -72,6 +74,30 @@ public void testVectorCastDoubleToLong() throws HiveException { Assert.assertEquals(1, resultV.vector[6]); } + // +8 hours from PST to GMT, needed because java.sql.Date will subtract 8 hours from final + // value because VM in test time zone is PST. + private static final long TIME_DIFFERENCE = 28800000L; + @Test + public void testCastDateToString() throws HiveException { + int[] intValues = new int[100]; + VectorizedRowBatch b = TestVectorMathFunctions.getVectorizedRowBatchDateInStringOut(intValues); + BytesColumnVector resultV = (BytesColumnVector) b.cols[1]; + b.cols[0].noNulls = true; + VectorExpression expr = new CastDateToString(0, 1); + expr.evaluate(b); + + String expected, result; + for (int i = 0; i < intValues.length; i++) { + expected = + new java.sql.Date(DateWritableV2.daysToMillis(intValues[i]) + TIME_DIFFERENCE).toString(); + byte[] subbyte = Arrays.copyOfRange(resultV.vector[i], resultV.start[i], + resultV.start[i] + resultV.length[i]); + result = new String(subbyte, StandardCharsets.UTF_8); + + Assert.assertEquals("Index: " + i + " Epoch day value: " + intValues[i], expected, result); + } + } + @Test public void testCastDateToTimestamp() throws HiveException { int[] intValues = new int[500]; @@ -192,6 +218,31 @@ public void testCastTimestampToDouble() throws HiveException { } } + @Test + public void testCastTimestampToString() throws HiveException { + int numberToTest = 100; + long[] epochSecondValues = new long[numberToTest]; + int[] nanoValues = new int[numberToTest]; + VectorizedRowBatch b = + TestVectorMathFunctions.getVectorizedRowBatchTimestampInStringOut(epochSecondValues, nanoValues); + BytesColumnVector resultV = (BytesColumnVector) b.cols[1]; + b.cols[0].noNulls = true; + VectorExpression expr = new CastTimestampToString(0, 1); + expr.evaluate(b); + + String expected, result; + for (int i = 0; i < numberToTest; i++) { + expected = org.apache.hadoop.hive.common.type.Timestamp + .ofEpochSecond(epochSecondValues[i], nanoValues[i]).toString(); + byte[] subbyte = Arrays.copyOfRange(resultV.vector[i], resultV.start[i], + resultV.start[i] + resultV.length[i]); + result = new String(subbyte, StandardCharsets.UTF_8); + Assert.assertEquals("Index: " + i + " Seconds since epoch: " + epochSecondValues[i] + + " 
nanoseconds: " + nanoValues[i], + expected, result); + } + } + public byte[] toBytes(String s) { byte[] b = null; try { diff --git ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorTypeCastsWithFormat.java ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorTypeCastsWithFormat.java new file mode 100644 index 0000000000..11faa41d50 --- /dev/null +++ ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorTypeCastsWithFormat.java @@ -0,0 +1,147 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.expressions; + +import org.apache.hadoop.hive.common.type.Date; +import org.apache.hadoop.hive.common.type.Timestamp; +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.junit.Assert; +import org.junit.Test; + +import java.nio.charset.StandardCharsets; +import java.util.Arrays; + +/** + * Tests vectorized type cast udfs CastDateToStringWithFormat, CastTimestampToStringWithFormat, + * CastStringToDateWithFormat, CastStringToTimestampWithFormat. 
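+ * Each expression is constructed with the input column index, the format pattern as UTF-8 + * bytes, and the output column index; e.g. new CastDateToStringWithFormat(0, + * "yyyy".getBytes(), 1) formats the dates in column 0 into strings in column 1.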
+ */ +public class TestVectorTypeCastsWithFormat { + + @Test + public void testCastDateToStringWithFormat() throws HiveException { + VectorizedRowBatch b = TestVectorMathFunctions.getVectorizedRowBatchDateInStringOutFormatted(); + BytesColumnVector resultV = (BytesColumnVector) b.cols[1]; + VectorExpression expr = new CastDateToStringWithFormat(0, "yyyy".getBytes(), 1); + expr.evaluate(b); + verifyString(0, "2019", resultV); + verifyString(1, "1776", resultV); + verifyString(2, "2012", resultV); + verifyString(3, "1580", resultV); + verifyString(4, "0005", resultV); + verifyString(5, "9999", resultV); + + expr = new CastDateToStringWithFormat(0, "MM".getBytes(), 1); + resultV = new BytesColumnVector(); + b.cols[1] = resultV; + expr.evaluate(b); + verifyString(0, "12", resultV); + verifyString(1, "07", resultV); + verifyString(2, "02", resultV); + verifyString(3, "08", resultV); + verifyString(4, "01", resultV); + verifyString(5, "12", resultV); + } + + @Test + public void testCastTimestampToStringWithFormat() throws HiveException { + VectorizedRowBatch b = + TestVectorMathFunctions.getVectorizedRowBatchTimestampInStringOutFormatted(); + BytesColumnVector resultV = (BytesColumnVector) b.cols[1]; + VectorExpression expr = new CastTimestampToStringWithFormat(0, "yyyy".getBytes(), 1); + expr.evaluate(b); + + Assert.assertEquals("2019", getStringFromBytesColumnVector(resultV, 0)); + Assert.assertEquals("1776", getStringFromBytesColumnVector(resultV, 1)); + Assert.assertEquals("2012", getStringFromBytesColumnVector(resultV, 2)); + Assert.assertEquals("1580", getStringFromBytesColumnVector(resultV, 3)); + Assert.assertEquals("0005", getStringFromBytesColumnVector(resultV, 4)); + Assert.assertEquals("9999", getStringFromBytesColumnVector(resultV, 5)); + + resultV = new BytesColumnVector(); + b.cols[1] = resultV; + expr = new CastTimestampToStringWithFormat(0, "HH24".getBytes(), 1); + expr.evaluate(b); + + Assert.assertEquals("19", getStringFromBytesColumnVector(resultV, 0)); + Assert.assertEquals("17", getStringFromBytesColumnVector(resultV, 1)); + Assert.assertEquals("23", getStringFromBytesColumnVector(resultV, 2)); + Assert.assertEquals("00", getStringFromBytesColumnVector(resultV, 3)); + Assert.assertEquals("00", getStringFromBytesColumnVector(resultV, 4)); + Assert.assertEquals("23", getStringFromBytesColumnVector(resultV, 5)); + } + + @Test + public void testCastStringToTimestampWithFormat() throws HiveException { + VectorizedRowBatch b = + TestVectorMathFunctions.getVectorizedRowBatchStringInTimestampOutFormatted(); + TimestampColumnVector resultV; + resultV = new TimestampColumnVector(); + b.cols[1] = resultV; + VectorExpression expr = + new CastStringToTimestampWithFormat(0, "yyyy.mm.dd HH24.mi.ss.ff".getBytes(), 1); + expr.evaluate(b); + + verifyTimestamp("2019-12-31 00:00:00.999999999", resultV, 0); + verifyTimestamp("1776-07-04 17:07:06.177617761", resultV, 1); + verifyTimestamp("2012-02-29 23:59:59.999999999", resultV, 2); + verifyTimestamp("1580-08-08 00:00:00", resultV, 3); + verifyTimestamp("0005-01-01 00:00:00", resultV, 4); + verifyTimestamp("9999-12-31 23:59:59.999999999", resultV, 5); + } + + private void verifyTimestamp(String tsString, TimestampColumnVector resultV, int index) { + Assert.assertEquals(Timestamp.valueOf(tsString).toEpochMilli(), resultV.time[index]); + Assert.assertEquals(Timestamp.valueOf(tsString).getNanos(), resultV.nanos[index]); + } + + @Test + public void testCastStringToDateWithFormat() throws HiveException { + VectorizedRowBatch b = + 
TestVectorMathFunctions.getVectorizedRowBatchStringInDateOutFormatted(); + LongColumnVector resultV; + resultV = new LongColumnVector(); + b.cols[1] = resultV; + VectorExpression expr = new CastStringToDateWithFormat(0, "yyyy.mm.dd".getBytes(), 1); + expr.evaluate(b); + + Assert.assertEquals(Date.valueOf("2019-12-31").toEpochDay(), resultV.vector[0]); + Assert.assertEquals(Date.valueOf("1776-07-04").toEpochDay(), resultV.vector[1]); + Assert.assertEquals(Date.valueOf("2012-02-29").toEpochDay(), resultV.vector[2]); + Assert.assertEquals(Date.valueOf("1580-08-08").toEpochDay(), resultV.vector[3]); + Assert.assertEquals(Date.valueOf("0005-01-01").toEpochDay(), resultV.vector[4]); + Assert.assertEquals(Date.valueOf("9999-12-31").toEpochDay(), resultV.vector[5]); + } + + private void verifyString(int resultIndex, String expected, BytesColumnVector resultV) { + String result = getStringFromBytesColumnVector(resultV, resultIndex); + Assert.assertEquals(expected, result); + } + + private String getStringFromBytesColumnVector(BytesColumnVector resultV, int i) { + String result; + byte[] resultBytes = Arrays.copyOfRange(resultV.vector[i], resultV.start[i], + resultV.start[i] + resultV.length[i]); + result = new String(resultBytes, StandardCharsets.UTF_8); + return result; + } +} diff --git ql/src/test/org/apache/hadoop/hive/ql/udf/generic/TestGenericUDFCastWithFormat.java ql/src/test/org/apache/hadoop/hive/ql/udf/generic/TestGenericUDFCastWithFormat.java new file mode 100644 index 0000000000..ab82b8a8df --- /dev/null +++ ql/src/test/org/apache/hadoop/hive/ql/udf/generic/TestGenericUDFCastWithFormat.java @@ -0,0 +1,189 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +package org.apache.hadoop.hive.ql.udf.generic; + +import org.apache.hadoop.hive.common.type.Date; +import org.apache.hadoop.hive.common.type.HiveChar; +import org.apache.hadoop.hive.common.type.HiveVarchar; +import org.apache.hadoop.hive.common.type.Timestamp; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.parse.HiveParser_IdentifiersParser; +import org.apache.hadoop.hive.serde2.io.DateWritableV2; +import org.apache.hadoop.hive.serde2.io.TimestampWritableV2; +import org.apache.hadoop.hive.serde2.objectinspector.ConstantObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.Text; +import org.junit.Test; + +import static junit.framework.TestCase.assertEquals; +import static junit.framework.TestCase.assertNull; +import static org.junit.Assert.fail; + +/** + * Tests CAST (expr AS STRING/CHAR/VARCHAR FORMAT pattern) and + * CAST (expr AS TIMESTAMP/DATE FORMAT pattern). + */ +public class TestGenericUDFCastWithFormat { + + // type codes + public static final int CHAR = HiveParser_IdentifiersParser.TOK_CHAR; + public static final int VARCHAR = HiveParser_IdentifiersParser.TOK_VARCHAR; + public static final int STRING = HiveParser_IdentifiersParser.TOK_STRING; + public static final int DATE = HiveParser_IdentifiersParser.TOK_DATE; + public static final int TIMESTAMP = HiveParser_IdentifiersParser.TOK_TIMESTAMP; + + @Test + public void testDateToStringWithFormat() throws HiveException { + ObjectInspector inputOI = PrimitiveObjectInspectorFactory.writableDateObjectInspector; + testCast(STRING, inputOI, date("2009-07-30"), "yyyy-MM-dd", "2009-07-30"); + testCast(STRING, inputOI, date("2009-07-30"), "yyyy", "2009"); + testCast(STRING, inputOI, date("1969-07-30"), "dd", "30"); + + testCast(CHAR, 3, inputOI, date("2009-07-30"), "yyyy-MM-dd", "200"); + testCast(CHAR, 3, inputOI, date("2009-07-30"), "yyyy", "200"); + testCast(CHAR, 3, inputOI, date("1969-07-30"), "dd", "30 "); + + testCast(VARCHAR, 3, inputOI, date("2009-07-30"), "yyyy-MM-dd", "200"); + testCast(VARCHAR, 3, inputOI, date("2009-07-30"), "yyyy", "200"); + testCast(VARCHAR, 3, inputOI, date("1969-07-30"), "dd", "30"); + } + + @Test public void testTimestampToStringTypesWithFormat() throws HiveException { + ObjectInspector inputOI = PrimitiveObjectInspectorFactory.writableTimestampObjectInspector; + testCast(STRING, inputOI, timestamp("2009-07-30 00:00:08"), + "yyyy-MM-dd HH24:mi:ss", "2009-07-30 00:00:08"); + testCast(STRING, inputOI, timestamp("2009-07-30 11:02:00"), + "MM/dd/yyyy hh24miss", "07/30/2009 110200"); + testCast(STRING, inputOI, timestamp("2009-07-30 01:02:03"), "MM", "07"); + testCast(STRING, inputOI, timestamp("1969-07-30 00:00:00"), "yy", "69"); + + testCast(CHAR, 3, inputOI, timestamp("2009-07-30 00:00:08"), + "yyyy-MM-dd HH24:mi:ss", "200"); + testCast(CHAR, 3, inputOI, timestamp("2009-07-30 11:02:00"), + "MM/dd/yyyy hh24miss", "07/"); + testCast(CHAR, 3, inputOI, timestamp("2009-07-30 01:02:03"), "MM", "07 "); + testCast(CHAR, 3, inputOI, timestamp("1969-07-30 00:00:00"), "yy", "69 "); + + testCast(VARCHAR, 3, inputOI, timestamp("2009-07-30 00:00:08"), + "yyyy-MM-dd HH24:mi:ss", "200"); + testCast(VARCHAR, 3, inputOI, timestamp("2009-07-30 11:02:00"), + "MM/dd/yyyy hh24miss", "07/"); + testCast(VARCHAR, 3, inputOI, timestamp("2009-07-30
01:02:03"), "MM", "07"); + testCast(VARCHAR, 3, inputOI, timestamp("1969-07-30 00:00:00"), "yy", "69"); + } + + @Test public void testStringTypesToDateWithFormat() throws HiveException { + ObjectInspector inputOI = PrimitiveObjectInspectorFactory.javaStringObjectInspector; + testCast(DATE, inputOI, "2009-07-30", "yyyy-MM-dd", "2009-07-30"); + testCast(DATE, inputOI, "2009", "yyyy", "2009-01-01"); + testCast(DATE, inputOI, "30", "dd", "1970-01-30"); + + inputOI = PrimitiveObjectInspectorFactory.javaHiveCharObjectInspector; + testCast(DATE, inputOI, new HiveChar("2009-07-30", 7), "yyyy-MM", "2009-07-01"); + testCast(DATE, inputOI, new HiveChar("2009", 7), "yyyy", "2009-01-01"); + testCast(DATE, inputOI, new HiveChar("30", 7), "dd", "1970-01-30"); + + inputOI = PrimitiveObjectInspectorFactory.javaHiveVarcharObjectInspector; + testCast(DATE, inputOI, new HiveVarchar("2009-07-30", 7), "yyyy-MM", "2009-07-01"); + testCast(DATE, inputOI, new HiveVarchar("2009", 7), "yyyy", "2009-01-01"); + testCast(DATE, inputOI, new HiveVarchar("30", 7), "dd", "1970-01-30"); + } + + @Test public void testStringTypesToTimestampWithFormat() throws HiveException { + ObjectInspector inputOI = PrimitiveObjectInspectorFactory.javaStringObjectInspector; + testCast(TIMESTAMP, inputOI, "2009-07-30 01:02:03", "yyyy-MM-dd HH24:mi:ss", + "2009-07-30 01:02:03"); + testCast(TIMESTAMP, inputOI, "2009", "yyyy", "2009-01-01 00:00:00"); + testCast(TIMESTAMP, inputOI, "07/30/2009 11:0200", "MM/dd/yyyy hh24:miss", + "2009-07-30 11:02:00"); + testCast(TIMESTAMP, inputOI, "969.07.30.", "yyy.MM.dd.", "2969-07-30 00:00:00"); + + inputOI = PrimitiveObjectInspectorFactory.javaHiveCharObjectInspector; + testCast(TIMESTAMP, 13, inputOI, new HiveChar("2009-07-30 01:02:03", 13), "yyyy-MM-dd HH24", + "2009-07-30 01:00:00"); + testCast(TIMESTAMP, 7, inputOI, new HiveChar("2009", 7), "yyyy", "2009-01-01 00:00:00"); + testCast(TIMESTAMP, 18, inputOI, new HiveChar("07/30/2009 11:0200", 18), "MM/dd/yyyy hh24:miss", + "2009-07-30 11:02:00"); + testCast(TIMESTAMP, 7, inputOI, new HiveChar("969.07.30.", 7), "yyy.MM.", + "2969-07-01 00:00:00"); + + inputOI = PrimitiveObjectInspectorFactory.javaHiveVarcharObjectInspector; + testCast(TIMESTAMP, 13, inputOI, new HiveVarchar("2009-07-30 01:02:03", 13), "yyyy-MM-dd HH24", + "2009-07-30 01:00:00"); + testCast(TIMESTAMP, 7, inputOI, new HiveVarchar("2009", 7), "yyyy", "2009-01-01 00:00:00"); + testCast(TIMESTAMP, 18, inputOI, new HiveVarchar("07/30/2009 11:0200", 18), + "MM/dd/yyyy hh24:miss", "2009-07-30 11:02:00"); + testCast(TIMESTAMP, 7, inputOI, new HiveVarchar("969.07.30.", 7), "yyy.MM.", + "2969-07-01 00:00:00"); + } + + private TimestampWritableV2 timestamp(String s) { + return new TimestampWritableV2(Timestamp.valueOf(s)); + } + + private DateWritableV2 date(String s) { + return new DateWritableV2(Date.valueOf(s)); + } + + private void testCast(int typeCode, ObjectInspector inputOI, Object input, String format, + String expOutput) throws HiveException { + testCast(typeCode, 0, inputOI, input, format, expOutput); + } + + private void testCast(int typeCode, int length, ObjectInspector inputOI, Object input, String format, + String expOutput) + throws HiveException { + // initialize + GenericUDFCastFormat udf = new GenericUDFCastFormat(); + ConstantObjectInspector typeCodeOI = + PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector( + TypeInfoFactory.getPrimitiveTypeInfo("int"), new IntWritable(typeCode)); + ConstantObjectInspector formatOI = + 
PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector( + TypeInfoFactory.getPrimitiveTypeInfo("string"), new Text(format)); + ConstantObjectInspector lengthOI = + PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector( + TypeInfoFactory.getPrimitiveTypeInfo("int"), new IntWritable(length)); + ObjectInspector[] initArgs = {typeCodeOI, inputOI, formatOI, lengthOI}; + udf.initialize(initArgs); + + // evaluate + GenericUDF.DeferredObject typeCodeObj = new GenericUDF.DeferredJavaObject(typeCode); + GenericUDF.DeferredObject inputObj = new GenericUDF.DeferredJavaObject(input); + GenericUDF.DeferredObject formatObj = new GenericUDF.DeferredJavaObject(new Text(format)); + GenericUDF.DeferredObject lengthObj = new GenericUDF.DeferredJavaObject(length); + GenericUDF.DeferredObject[] evalArgs = { typeCodeObj, inputObj, formatObj, lengthObj }; + Object output = udf.evaluate(evalArgs); + if (output == null) { + fail( + "Cast " + inputOI.getTypeName() + " \"" + input + "\" to " + GenericUDFCastFormat.OUTPUT_TYPES + .get(typeCode) + " failed, output null"); + } + assertEquals( + "Cast " + inputOI.getTypeName() + " \"" + input + "\" to " + GenericUDFCastFormat.OUTPUT_TYPES.get(typeCode) + + " failed ", expOutput, output.toString()); + + // Try with null input + GenericUDF.DeferredObject[] nullArgs = + { typeCodeObj, new GenericUDF.DeferredJavaObject(null), formatObj, lengthObj }; + assertNull(udf.getFuncName() + " with NULL arguments failed", udf.evaluate(nullArgs)); + } +} diff --git ql/src/test/queries/clientpositive/cast_datetime_with_sql_2016_format.q ql/src/test/queries/clientpositive/cast_datetime_with_sql_2016_format.q new file mode 100644 index 0000000000..daeb61c975 --- /dev/null +++ ql/src/test/queries/clientpositive/cast_datetime_with_sql_2016_format.q @@ -0,0 +1,63 @@ +--non-vectorized +set hive.vectorized.execution.enabled=false; +set hive.fetch.task.conversion=more; + +create table timestamp1 (t timestamp) stored as parquet; +insert into timestamp1 values +("2020-02-03"), +("1969-12-31 23:59:59.999999999") +; +from timestamp1 select cast (t as string format "yyyy hh24...PM ff"); +from timestamp1 select cast (t as char(11) format "yyyy hh24...PM ff"); -- will be truncated +from timestamp1 select cast (t as varchar(11) format "yyyy hh24...PM ff"); -- will be truncated + +create table dates (d date) stored as parquet; +insert into dates values +("2020-02-03"), +("1969-12-31") +; +from dates select cast (d as string format "yyyy mm dd , hh24 mi ss ff9"); +from dates select cast (d as char(10) format "yyyy mm dd , hh24 mi ss ff9"); -- will be truncated +from dates select cast (d as varchar(10) format "yyyy mm dd , hh24 mi ss ff9"); -- will be truncated + +create table strings (s string) stored as parquet; +create table varchars (s varchar(11)) stored as parquet; +create table chars (s char(11)) stored as parquet; +insert into strings values +("20 / 2 / 3"), +("1969 12 31") +; +insert into varchars select * from strings; +insert into chars select * from strings; + +from strings select cast (s as timestamp format "yyyy.mm.dd"); +from strings select cast (s as date format "yyyy.mm.dd"); +from varchars select cast (s as timestamp format "yyyy.mm.dd"); +from varchars select cast (s as date format "yyyy.mm.dd"); +from chars select cast (s as timestamp format "yyyy.mm.dd"); +from chars select cast (s as date format "yyyy.mm.dd"); + + +--correct descriptions +explain from strings select cast (s as timestamp format "yyy.mm.dd"); +explain from strings select cast 
(s as date format "yyy.mm.dd"); +explain from timestamp1 select cast (t as string format "yyyy"); +explain from timestamp1 select cast (t as varchar(12) format "yyyy"); + + +--vectorized +set hive.vectorized.execution.enabled=true; +set hive.fetch.task.conversion=none; + +from timestamp1 select cast (t as string format "yyyy"); +from dates select cast (d as string format "yyyy"); +from timestamp1 select cast (t as varchar(11) format "yyyy"); +from dates select cast (d as varchar(11) format "yyyy"); +from timestamp1 select cast (t as char(11) format "yyyy"); +from dates select cast (d as char(11) format "yyyy"); +from strings select cast (s as timestamp format "yyyy.mm.dd"); +from varchars select cast (s as timestamp format "yyyy.mm.dd"); +from chars select cast (s as timestamp format "yyyy.mm.dd"); +from strings select cast (s as date format "yyyy.mm.dd"); +from varchars select cast (s as date format "yyyy.mm.dd"); +from chars select cast (s as date format "yyyy.mm.dd"); diff --git ql/src/test/results/clientpositive/cast_datetime_with_sql_2016_format.q.out ql/src/test/results/clientpositive/cast_datetime_with_sql_2016_format.q.out new file mode 100644 index 0000000000..1456466fcb --- /dev/null +++ ql/src/test/results/clientpositive/cast_datetime_with_sql_2016_format.q.out @@ -0,0 +1,449 @@ +PREHOOK: query: create table timestamp1 (t timestamp) stored as parquet +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@timestamp1 +POSTHOOK: query: create table timestamp1 (t timestamp) stored as parquet +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@timestamp1 +PREHOOK: query: insert into timestamp1 values +("2020-02-03"), +("1969-12-31 23:59:59.999999999") +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@timestamp1 +POSTHOOK: query: insert into timestamp1 values +("2020-02-03"), +("1969-12-31 23:59:59.999999999") +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@timestamp1 +POSTHOOK: Lineage: timestamp1.t SCRIPT [] +PREHOOK: query: from timestamp1 select cast (t as string format "yyyy hh24...PM ff") +PREHOOK: type: QUERY +PREHOOK: Input: default@timestamp1 +#### A masked pattern was here #### +POSTHOOK: query: from timestamp1 select cast (t as string format "yyyy hh24...PM ff") +POSTHOOK: type: QUERY +POSTHOOK: Input: default@timestamp1 +#### A masked pattern was here #### +2020 00...AM 0 +1969 23...PM 999999999 +PREHOOK: query: from timestamp1 select cast (t as char(11) format "yyyy hh24...PM ff") +PREHOOK: type: QUERY +PREHOOK: Input: default@timestamp1 +#### A masked pattern was here #### +POSTHOOK: query: from timestamp1 select cast (t as char(11) format "yyyy hh24...PM ff") +POSTHOOK: type: QUERY +POSTHOOK: Input: default@timestamp1 +#### A masked pattern was here #### +2020 00...A +1969 23...P +PREHOOK: query: -- will be truncated +from timestamp1 select cast (t as varchar(11) format "yyyy hh24...PM ff") +PREHOOK: type: QUERY +PREHOOK: Input: default@timestamp1 +#### A masked pattern was here #### +POSTHOOK: query: -- will be truncated +from timestamp1 select cast (t as varchar(11) format "yyyy hh24...PM ff") +POSTHOOK: type: QUERY +POSTHOOK: Input: default@timestamp1 +#### A masked pattern was here #### +2020 00...A +1969 23...P +PREHOOK: query: -- will be truncated + +create table dates (d date) stored as parquet +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@dates +POSTHOOK: 
query: -- will be truncated + +create table dates (d date) stored as parquet +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@dates +PREHOOK: query: insert into dates values +("2020-02-03"), +("1969-12-31") +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@dates +POSTHOOK: query: insert into dates values +("2020-02-03"), +("1969-12-31") +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@dates +POSTHOOK: Lineage: dates.d SCRIPT [] +PREHOOK: query: from dates select cast (d as string format "yyyy mm dd , hh24 mi ss ff9") +PREHOOK: type: QUERY +PREHOOK: Input: default@dates +#### A masked pattern was here #### +POSTHOOK: query: from dates select cast (d as string format "yyyy mm dd , hh24 mi ss ff9") +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dates +#### A masked pattern was here #### +2020 02 03 , 00 00 00 000000000 +1969 12 31 , 00 00 00 000000000 +PREHOOK: query: from dates select cast (d as char(10) format "yyyy mm dd , hh24 mi ss ff9") +PREHOOK: type: QUERY +PREHOOK: Input: default@dates +#### A masked pattern was here #### +POSTHOOK: query: from dates select cast (d as char(10) format "yyyy mm dd , hh24 mi ss ff9") +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dates +#### A masked pattern was here #### +2020 02 03 +1969 12 31 +PREHOOK: query: -- will be truncated +from dates select cast (d as varchar(10) format "yyyy mm dd , hh24 mi ss ff9") +PREHOOK: type: QUERY +PREHOOK: Input: default@dates +#### A masked pattern was here #### +POSTHOOK: query: -- will be truncated +from dates select cast (d as varchar(10) format "yyyy mm dd , hh24 mi ss ff9") +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dates +#### A masked pattern was here #### +2020 02 03 +1969 12 31 +PREHOOK: query: -- will be truncated + +create table strings (s string) stored as parquet +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@strings +POSTHOOK: query: -- will be truncated + +create table strings (s string) stored as parquet +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@strings +PREHOOK: query: create table varchars (s varchar(11)) stored as parquet +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@varchars +POSTHOOK: query: create table varchars (s varchar(11)) stored as parquet +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@varchars +PREHOOK: query: create table chars (s char(11)) stored as parquet +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@chars +POSTHOOK: query: create table chars (s char(11)) stored as parquet +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@chars +PREHOOK: query: insert into strings values +("20 / 2 / 3"), +("1969 12 31") +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@strings +POSTHOOK: query: insert into strings values +("20 / 2 / 3"), +("1969 12 31") +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@strings +POSTHOOK: Lineage: strings.s SCRIPT [] +PREHOOK: query: insert into varchars select * from strings +PREHOOK: type: QUERY +PREHOOK: Input: default@strings +PREHOOK: Output: default@varchars +POSTHOOK: query: insert into varchars select * from strings +POSTHOOK: type: QUERY +POSTHOOK: Input: 
default@strings +POSTHOOK: Output: default@varchars +POSTHOOK: Lineage: varchars.s EXPRESSION [(strings)strings.FieldSchema(name:s, type:string, comment:null), ] +PREHOOK: query: insert into chars select * from strings +PREHOOK: type: QUERY +PREHOOK: Input: default@strings +PREHOOK: Output: default@chars +POSTHOOK: query: insert into chars select * from strings +POSTHOOK: type: QUERY +POSTHOOK: Input: default@strings +POSTHOOK: Output: default@chars +POSTHOOK: Lineage: chars.s EXPRESSION [(strings)strings.FieldSchema(name:s, type:string, comment:null), ] +PREHOOK: query: from strings select cast (s as timestamp format "yyyy.mm.dd") +PREHOOK: type: QUERY +PREHOOK: Input: default@strings +#### A masked pattern was here #### +POSTHOOK: query: from strings select cast (s as timestamp format "yyyy.mm.dd") +POSTHOOK: type: QUERY +POSTHOOK: Input: default@strings +#### A masked pattern was here #### +2020-02-03 00:00:00 +1969-12-31 00:00:00 +PREHOOK: query: from strings select cast (s as date format "yyyy.mm.dd") +PREHOOK: type: QUERY +PREHOOK: Input: default@strings +#### A masked pattern was here #### +POSTHOOK: query: from strings select cast (s as date format "yyyy.mm.dd") +POSTHOOK: type: QUERY +POSTHOOK: Input: default@strings +#### A masked pattern was here #### +2020-02-03 +1969-12-31 +PREHOOK: query: from varchars select cast (s as timestamp format "yyyy.mm.dd") +PREHOOK: type: QUERY +PREHOOK: Input: default@varchars +#### A masked pattern was here #### +POSTHOOK: query: from varchars select cast (s as timestamp format "yyyy.mm.dd") +POSTHOOK: type: QUERY +POSTHOOK: Input: default@varchars +#### A masked pattern was here #### +2020-02-03 00:00:00 +1969-12-31 00:00:00 +PREHOOK: query: from varchars select cast (s as date format "yyyy.mm.dd") +PREHOOK: type: QUERY +PREHOOK: Input: default@varchars +#### A masked pattern was here #### +POSTHOOK: query: from varchars select cast (s as date format "yyyy.mm.dd") +POSTHOOK: type: QUERY +POSTHOOK: Input: default@varchars +#### A masked pattern was here #### +2020-02-03 +1969-12-31 +PREHOOK: query: from chars select cast (s as timestamp format "yyyy.mm.dd") +PREHOOK: type: QUERY +PREHOOK: Input: default@chars +#### A masked pattern was here #### +POSTHOOK: query: from chars select cast (s as timestamp format "yyyy.mm.dd") +POSTHOOK: type: QUERY +POSTHOOK: Input: default@chars +#### A masked pattern was here #### +2020-02-03 00:00:00 +1969-12-31 00:00:00 +PREHOOK: query: from chars select cast (s as date format "yyyy.mm.dd") +PREHOOK: type: QUERY +PREHOOK: Input: default@chars +#### A masked pattern was here #### +POSTHOOK: query: from chars select cast (s as date format "yyyy.mm.dd") +POSTHOOK: type: QUERY +POSTHOOK: Input: default@chars +#### A masked pattern was here #### +2020-02-03 +1969-12-31 +PREHOOK: query: explain from strings select cast (s as timestamp format "yyy.mm.dd") +PREHOOK: type: QUERY +PREHOOK: Input: default@strings +#### A masked pattern was here #### +POSTHOOK: query: explain from strings select cast (s as timestamp format "yyy.mm.dd") +POSTHOOK: type: QUERY +POSTHOOK: Input: default@strings +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + TableScan + alias: strings + Statistics: Num rows: 2 Data size: 188 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: CAST( s AS timestamp FORMAT 'yyy.mm.dd' ) (type: timestamp) + outputColumnNames: _col0 + Statistics: Num rows: 2 Data size: 80 
Basic stats: COMPLETE Column stats: COMPLETE + ListSink + +PREHOOK: query: explain from strings select cast (s as date format "yyy.mm.dd") +PREHOOK: type: QUERY +PREHOOK: Input: default@strings +#### A masked pattern was here #### +POSTHOOK: query: explain from strings select cast (s as date format "yyy.mm.dd") +POSTHOOK: type: QUERY +POSTHOOK: Input: default@strings +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + TableScan + alias: strings + Statistics: Num rows: 2 Data size: 188 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: CAST( s AS date FORMAT 'yyy.mm.dd' ) (type: date) + outputColumnNames: _col0 + Statistics: Num rows: 2 Data size: 112 Basic stats: COMPLETE Column stats: COMPLETE + ListSink + +PREHOOK: query: explain from timestamp1 select cast (t as string format "yyyy") +PREHOOK: type: QUERY +PREHOOK: Input: default@timestamp1 +#### A masked pattern was here #### +POSTHOOK: query: explain from timestamp1 select cast (t as string format "yyyy") +POSTHOOK: type: QUERY +POSTHOOK: Input: default@timestamp1 +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + TableScan + alias: timestamp1 + Statistics: Num rows: 2 Data size: 80 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: CAST( t AS string FORMAT 'yyyy' ) (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 2 Data size: 368 Basic stats: COMPLETE Column stats: COMPLETE + ListSink + +PREHOOK: query: explain from timestamp1 select cast (t as varchar(12) format "yyyy") +PREHOOK: type: QUERY +PREHOOK: Input: default@timestamp1 +#### A masked pattern was here #### +POSTHOOK: query: explain from timestamp1 select cast (t as varchar(12) format "yyyy") +POSTHOOK: type: QUERY +POSTHOOK: Input: default@timestamp1 +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + TableScan + alias: timestamp1 + Statistics: Num rows: 2 Data size: 80 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: CAST( t AS varchar(12) FORMAT 'yyyy' ) (type: varchar(12)) + outputColumnNames: _col0 + Statistics: Num rows: 2 Data size: 192 Basic stats: COMPLETE Column stats: COMPLETE + ListSink + +PREHOOK: query: from timestamp1 select cast (t as string format "yyyy") +PREHOOK: type: QUERY +PREHOOK: Input: default@timestamp1 +#### A masked pattern was here #### +POSTHOOK: query: from timestamp1 select cast (t as string format "yyyy") +POSTHOOK: type: QUERY +POSTHOOK: Input: default@timestamp1 +#### A masked pattern was here #### +2020 +1969 +PREHOOK: query: from dates select cast (d as string format "yyyy") +PREHOOK: type: QUERY +PREHOOK: Input: default@dates +#### A masked pattern was here #### +POSTHOOK: query: from dates select cast (d as string format "yyyy") +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dates +#### A masked pattern was here #### +2020 +1969 +PREHOOK: query: from timestamp1 select cast (t as varchar(11) format "yyyy") +PREHOOK: type: QUERY +PREHOOK: Input: default@timestamp1 +#### A masked pattern was here #### +POSTHOOK: query: from timestamp1 select cast (t as varchar(11) format "yyyy") +POSTHOOK: type: QUERY +POSTHOOK: Input: default@timestamp1 +#### A masked pattern was here #### +2020 +1969 +PREHOOK: query: 
from dates select cast (d as varchar(11) format "yyyy") +PREHOOK: type: QUERY +PREHOOK: Input: default@dates +#### A masked pattern was here #### +POSTHOOK: query: from dates select cast (d as varchar(11) format "yyyy") +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dates +#### A masked pattern was here #### +2020 +1969 +PREHOOK: query: from timestamp1 select cast (t as char(11) format "yyyy") +PREHOOK: type: QUERY +PREHOOK: Input: default@timestamp1 +#### A masked pattern was here #### +POSTHOOK: query: from timestamp1 select cast (t as char(11) format "yyyy") +POSTHOOK: type: QUERY +POSTHOOK: Input: default@timestamp1 +#### A masked pattern was here #### +2020 +1969 +PREHOOK: query: from dates select cast (d as char(11) format "yyyy") +PREHOOK: type: QUERY +PREHOOK: Input: default@dates +#### A masked pattern was here #### +POSTHOOK: query: from dates select cast (d as char(11) format "yyyy") +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dates +#### A masked pattern was here #### +2020 +1969 +PREHOOK: query: from strings select cast (s as timestamp format "yyyy.mm.dd") +PREHOOK: type: QUERY +PREHOOK: Input: default@strings +#### A masked pattern was here #### +POSTHOOK: query: from strings select cast (s as timestamp format "yyyy.mm.dd") +POSTHOOK: type: QUERY +POSTHOOK: Input: default@strings +#### A masked pattern was here #### +2020-02-03 00:00:00 +1969-12-31 00:00:00 +PREHOOK: query: from varchars select cast (s as timestamp format "yyyy.mm.dd") +PREHOOK: type: QUERY +PREHOOK: Input: default@varchars +#### A masked pattern was here #### +POSTHOOK: query: from varchars select cast (s as timestamp format "yyyy.mm.dd") +POSTHOOK: type: QUERY +POSTHOOK: Input: default@varchars +#### A masked pattern was here #### +2020-02-03 00:00:00 +1969-12-31 00:00:00 +PREHOOK: query: from chars select cast (s as timestamp format "yyyy.mm.dd") +PREHOOK: type: QUERY +PREHOOK: Input: default@chars +#### A masked pattern was here #### +POSTHOOK: query: from chars select cast (s as timestamp format "yyyy.mm.dd") +POSTHOOK: type: QUERY +POSTHOOK: Input: default@chars +#### A masked pattern was here #### +2020-02-03 00:00:00 +1969-12-31 00:00:00 +PREHOOK: query: from strings select cast (s as date format "yyyy.mm.dd") +PREHOOK: type: QUERY +PREHOOK: Input: default@strings +#### A masked pattern was here #### +POSTHOOK: query: from strings select cast (s as date format "yyyy.mm.dd") +POSTHOOK: type: QUERY +POSTHOOK: Input: default@strings +#### A masked pattern was here #### +2020-02-03 +1969-12-31 +PREHOOK: query: from varchars select cast (s as date format "yyyy.mm.dd") +PREHOOK: type: QUERY +PREHOOK: Input: default@varchars +#### A masked pattern was here #### +POSTHOOK: query: from varchars select cast (s as date format "yyyy.mm.dd") +POSTHOOK: type: QUERY +POSTHOOK: Input: default@varchars +#### A masked pattern was here #### +2020-02-03 +1969-12-31 +PREHOOK: query: from chars select cast (s as date format "yyyy.mm.dd") +PREHOOK: type: QUERY +PREHOOK: Input: default@chars +#### A masked pattern was here #### +POSTHOOK: query: from chars select cast (s as date format "yyyy.mm.dd") +POSTHOOK: type: QUERY +POSTHOOK: Input: default@chars +#### A masked pattern was here #### +2020-02-03 +1969-12-31 diff --git ql/src/test/results/clientpositive/show_functions.q.out ql/src/test/results/clientpositive/show_functions.q.out index 374e9c4fce..84a9243da9 100644 --- ql/src/test/results/clientpositive/show_functions.q.out +++ ql/src/test/results/clientpositive/show_functions.q.out @@ -62,6 +62,7 @@ bucket_number 
buildversion cardinality_violation case +cast_format cbrt ceil ceiling @@ -349,6 +350,7 @@ POSTHOOK: query: SHOW FUNCTIONS '^c.*' POSTHOOK: type: SHOWFUNCTIONS cardinality_violation case +cast_format cbrt ceil ceiling diff --git serde/src/java/org/apache/hadoop/hive/serde2/io/DateWritableV2.java serde/src/java/org/apache/hadoop/hive/serde2/io/DateWritableV2.java index 4b6a3d6c10..4ff4732324 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/io/DateWritableV2.java +++ serde/src/java/org/apache/hadoop/hive/serde2/io/DateWritableV2.java @@ -21,6 +21,7 @@ import java.io.DataOutput; import java.io.IOException; +import org.apache.hadoop.hive.common.format.datetime.HiveDateTimeFormatter; import org.apache.hadoop.hive.common.type.Date; import org.apache.hadoop.io.WritableComparable; import org.apache.hadoop.io.WritableUtils; @@ -147,6 +148,10 @@ public String toString() { return date.toString(); } + public String toStringFormatted(HiveDateTimeFormatter formatter) { + return date.toStringFormatted(formatter); + } + @Override public int hashCode() { return date.toEpochDay(); diff --git serde/src/java/org/apache/hadoop/hive/serde2/io/TimestampWritableV2.java serde/src/java/org/apache/hadoop/hive/serde2/io/TimestampWritableV2.java index 9aa7f19ab2..5972bd92b5 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/io/TimestampWritableV2.java +++ serde/src/java/org/apache/hadoop/hive/serde2/io/TimestampWritableV2.java @@ -22,6 +22,7 @@ import java.io.IOException; import java.time.format.DateTimeFormatter; +import org.apache.hadoop.hive.common.format.datetime.HiveDateTimeFormatter; import org.apache.hadoop.hive.common.type.HiveDecimal; import org.apache.hadoop.hive.common.type.Timestamp; import org.apache.hadoop.hive.common.type.TimestampUtils; @@ -387,6 +388,16 @@ public String toString() { return timestamp.format(DATE_TIME_FORMAT); } + public String toStringFormatted(HiveDateTimeFormatter formatter) { + if (formatter == null) { + return toString(); + } + if (timestampEmpty) { + populateTimestamp(); + } + return timestamp.toStringFormatted(formatter); + } + @Override public int hashCode() { long seconds = getSeconds(); diff --git serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ObjectInspectorConverters.java serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ObjectInspectorConverters.java index 9129177375..b654dc36a3 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ObjectInspectorConverters.java +++ serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ObjectInspectorConverters.java @@ -24,6 +24,7 @@ import org.apache.hadoop.hive.common.classification.InterfaceAudience; import org.apache.hadoop.hive.common.classification.InterfaceStability; +import org.apache.hadoop.hive.common.format.datetime.HiveDateTimeFormatter; import org.apache.hadoop.hive.serde2.objectinspector.primitive.JavaStringObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorConverter; import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; @@ -45,7 +46,6 @@ import org.apache.hadoop.hive.serde2.objectinspector.primitive.SettableTimestampLocalTZObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.VoidObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableStringObjectInspector; -import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; /** * ObjectInspectorConverters. 
@@ -61,6 +61,9 @@ public static interface Converter { Object convert(Object input); } + public interface ConverterWithFormatOption extends Converter { + void setDateTimeFormatter(HiveDateTimeFormatter formatter); + } /** * IdentityConverter. diff --git serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/primitive/PrimitiveObjectInspectorConverter.java serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/primitive/PrimitiveObjectInspectorConverter.java index 84c027d51c..1ff338ddc4 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/primitive/PrimitiveObjectInspectorConverter.java +++ serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/primitive/PrimitiveObjectInspectorConverter.java @@ -20,6 +20,7 @@ import java.time.ZoneId; +import org.apache.hadoop.hive.common.format.datetime.HiveDateTimeFormatter; import org.apache.hadoop.hive.common.type.Date; import org.apache.hadoop.hive.common.type.HiveChar; import org.apache.hadoop.hive.common.type.HiveDecimal; @@ -32,6 +33,7 @@ import org.apache.hadoop.hive.serde2.lazy.LazyInteger; import org.apache.hadoop.hive.serde2.lazy.LazyLong; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters.Converter; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters.ConverterWithFormatOption; import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; import org.apache.hadoop.hive.serde2.typeinfo.TimestampLocalTZTypeInfo; import org.apache.hadoop.io.BytesWritable; @@ -246,10 +248,11 @@ public Object convert(Object input) { } } - public static class DateConverter implements Converter { + public static class DateConverter implements ConverterWithFormatOption { PrimitiveObjectInspector inputOI; SettableDateObjectInspector outputOI; Object r; + private HiveDateTimeFormatter formatter = null; public DateConverter(PrimitiveObjectInspector inputOI, SettableDateObjectInspector outputOI) { @@ -263,15 +266,20 @@ public Object convert(Object input) { return null; } return outputOI.set(r, PrimitiveObjectInspectorUtils.getDate(input, - inputOI)); + inputOI, formatter)); + } + + public void setDateTimeFormatter(HiveDateTimeFormatter formatter) { + this.formatter = formatter; } } - public static class TimestampConverter implements Converter { + public static class TimestampConverter implements ConverterWithFormatOption { PrimitiveObjectInspector inputOI; SettableTimestampObjectInspector outputOI; boolean intToTimestampInSeconds = false; Object r; + private HiveDateTimeFormatter formatter = null; public TimestampConverter(PrimitiveObjectInspector inputOI, SettableTimestampObjectInspector outputOI) { @@ -289,7 +297,11 @@ public Object convert(Object input) { return null; } return outputOI.set(r, PrimitiveObjectInspectorUtils.getTimestamp(input, - inputOI, intToTimestampInSeconds)); + inputOI, intToTimestampInSeconds, formatter)); + } + + public void setDateTimeFormatter(HiveDateTimeFormatter formatter) { + this.formatter = formatter; } } @@ -409,13 +421,14 @@ public Object convert(Object input) { /** * A helper class to convert any primitive to Text. 
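+ * For DATE and TIMESTAMP inputs, an optional HiveDateTimeFormatter set via + * setDateTimeFormatter controls the output; when no formatter is set, the default string + * representation is used.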
*/ - public static class TextConverter implements Converter { + public static class TextConverter implements ConverterWithFormatOption { private final PrimitiveObjectInspector inputOI; private final Text t = new Text(); private final ByteStream.Output out = new ByteStream.Output(); private static byte[] trueBytes = {'T', 'R', 'U', 'E'}; private static byte[] falseBytes = {'F', 'A', 'L', 'S', 'E'}; + private HiveDateTimeFormatter formatter = null; public TextConverter(PrimitiveObjectInspector inputOI) { // The output ObjectInspector is writableStringObjectInspector. @@ -486,11 +499,12 @@ public Text convert(Object input) { } return t; case DATE: - t.set(((DateObjectInspector) inputOI).getPrimitiveWritableObject(input).toString()); + t.set(((DateObjectInspector) inputOI) + .getPrimitiveWritableObject(input).toStringFormatted(formatter)); return t; case TIMESTAMP: t.set(((TimestampObjectInspector) inputOI) - .getPrimitiveWritableObject(input).toString()); + .getPrimitiveWritableObject(input).toStringFormatted(formatter)); return t; case TIMESTAMPLOCALTZ: t.set(((TimestampLocalTZObjectInspector) inputOI).getPrimitiveWritableObject(input).toString()); @@ -520,6 +534,10 @@ public Text convert(Object input) { throw new RuntimeException("Hive 2 Internal error: type = " + inputOI.getTypeName()); } } + + public void setDateTimeFormatter(HiveDateTimeFormatter formatter) { + this.formatter = formatter; + } } /** @@ -540,11 +558,12 @@ public Object convert(Object input) { } - public static class HiveVarcharConverter implements Converter { + public static class HiveVarcharConverter implements ConverterWithFormatOption { PrimitiveObjectInspector inputOI; SettableHiveVarcharObjectInspector outputOI; Object hc; + private HiveDateTimeFormatter formatter; public HiveVarcharConverter(PrimitiveObjectInspector inputOI, SettableHiveVarcharObjectInspector outputOI) { @@ -567,21 +586,26 @@ public Object convert(Object input) { return null; } switch (inputOI.getPrimitiveCategory()) { - case BOOLEAN: - return outputOI.set(hc, - ((BooleanObjectInspector) inputOI).get(input) ? - new HiveVarchar("TRUE", -1) : new HiveVarchar("FALSE", -1)); - default: - return outputOI.set(hc, PrimitiveObjectInspectorUtils.getHiveVarchar(input, inputOI)); + case BOOLEAN: + return outputOI.set(hc, + ((BooleanObjectInspector) inputOI).get(input) ? new HiveVarchar("TRUE", + -1) : new HiveVarchar("FALSE", -1)); + default: + return outputOI + .set(hc, PrimitiveObjectInspectorUtils.getHiveVarchar(input, inputOI, formatter)); } } + public void setDateTimeFormatter(HiveDateTimeFormatter formatter) { + this.formatter = formatter; + } } - public static class HiveCharConverter implements Converter { + public static class HiveCharConverter implements ConverterWithFormatOption { PrimitiveObjectInspector inputOI; SettableHiveCharObjectInspector outputOI; Object hc; + private HiveDateTimeFormatter formatter; public HiveCharConverter(PrimitiveObjectInspector inputOI, SettableHiveCharObjectInspector outputOI) { @@ -601,8 +625,13 @@ public Object convert(Object input) { ((BooleanObjectInspector) inputOI).get(input) ? 
new HiveChar("TRUE", -1) : new HiveChar("FALSE", -1)); default: - return outputOI.set(hc, PrimitiveObjectInspectorUtils.getHiveChar(input, inputOI)); + return outputOI.set(hc, + PrimitiveObjectInspectorUtils.getHiveChar(input, inputOI, formatter)); } } + + public void setDateTimeFormatter(HiveDateTimeFormatter formatter) { + this.formatter = formatter; + } } } diff --git serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/primitive/PrimitiveObjectInspectorUtils.java serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/primitive/PrimitiveObjectInspectorUtils.java index 3886b202c7..6cf231e7ae 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/primitive/PrimitiveObjectInspectorUtils.java +++ serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/primitive/PrimitiveObjectInspectorUtils.java @@ -29,6 +29,7 @@ import org.apache.hadoop.hive.common.classification.InterfaceAudience; import org.apache.hadoop.hive.common.classification.InterfaceStability; +import org.apache.hadoop.hive.common.format.datetime.HiveDateTimeFormatter; import org.apache.hadoop.hive.common.type.Date; import org.apache.hadoop.hive.common.type.HiveChar; import org.apache.hadoop.hive.common.type.HiveDecimal; @@ -891,12 +892,18 @@ public static float getFloat(Object o, PrimitiveObjectInspector oi) { return (float) getDouble(o, oi); } + public static String getString(Object o, PrimitiveObjectInspector oi) { + return getString(o, oi, null); + } + /** * Get the String value out of a primitive object. Note that * NullPointerException will be thrown if o is null. Note that * RuntimeException will be thrown if o is not a valid string. + * HiveDateTimeFormatter is optional. */ - public static String getString(Object o, PrimitiveObjectInspector oi) { + public static String getString(Object o, PrimitiveObjectInspector oi, + HiveDateTimeFormatter formatter) { if (o == null) { return null; @@ -951,10 +958,12 @@ public static String getString(Object o, PrimitiveObjectInspector oi) { result = hcoi.getPrimitiveJavaObject(o).toString(); break; case DATE: - result = ((DateObjectInspector) oi).getPrimitiveWritableObject(o).toString(); + result = ((DateObjectInspector) oi).getPrimitiveWritableObject(o) + .toStringFormatted(formatter); break; case TIMESTAMP: - result = ((TimestampObjectInspector) oi).getPrimitiveWritableObject(o).toString(); + result = ((TimestampObjectInspector) oi).getPrimitiveWritableObject(o) + .toStringFormatted(formatter); break; case TIMESTAMPLOCALTZ: result = ((TimestampLocalTZObjectInspector) oi).getPrimitiveWritableObject(o).toString(); @@ -978,25 +987,35 @@ public static String getString(Object o, PrimitiveObjectInspector oi) { } public static HiveChar getHiveChar(Object o, PrimitiveObjectInspector oi) { + return getHiveChar(o, oi, null); + } + + public static HiveChar getHiveChar(Object o, PrimitiveObjectInspector oi, + HiveDateTimeFormatter formatter) { if (o == null) { return null; } HiveChar result = null; switch (oi.getPrimitiveCategory()) { - case CHAR: - result = ((HiveCharObjectInspector) oi).getPrimitiveJavaObject(o); - break; - default: - // No char length available, copy whole string value here. - result = new HiveChar(); - result.setValue(getString(o, oi)); - break; + case CHAR: + result = ((HiveCharObjectInspector) oi).getPrimitiveJavaObject(o); + break; + default: + // No char length available, copy whole string value here. 
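+ // The formatter may be null, in which case getString falls back to the default + // date/timestamp string representation.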
+ result = new HiveChar(); + result.setValue(getString(o, oi, formatter)); + break; } return result; } public static HiveVarchar getHiveVarchar(Object o, PrimitiveObjectInspector oi) { + return getHiveVarchar(o, oi, null); + } + + public static HiveVarchar getHiveVarchar(Object o, PrimitiveObjectInspector oi, + HiveDateTimeFormatter formatter) { if (o == null) { return null; @@ -1004,16 +1023,16 @@ public static HiveVarchar getHiveVarchar(Object o, PrimitiveObjectInspector oi) HiveVarchar result = null; switch (oi.getPrimitiveCategory()) { - case VARCHAR: - result = ((HiveVarcharObjectInspector)oi).getPrimitiveJavaObject(o); - break; - default: - // Is there a way to provide char length here? - // It might actually be ok as long as there is an object inspector (with char length) - // receiving this value. - result = new HiveVarchar(); - result.setValue(getString(o, oi)); - break; + case VARCHAR: + result = ((HiveVarcharObjectInspector) oi).getPrimitiveJavaObject(o); + break; + default: + // Is there a way to provide char length here? + // It might actually be ok as long as there is an object inspector (with char length) + // receiving this value. + result = new HiveVarchar(); + result.setValue(getString(o, oi, formatter)); + break; } return result; } @@ -1113,6 +1132,11 @@ public static HiveDecimal getHiveDecimal(Object o, PrimitiveObjectInspector oi) } public static Date getDate(Object o, PrimitiveObjectInspector oi) { + return getDate(o, oi, null); + } + + public static Date getDate( + Object o, PrimitiveObjectInspector oi, HiveDateTimeFormatter formatter) { if (o == null) { return null; } @@ -1125,13 +1149,9 @@ public static Date getDate(Object o, PrimitiveObjectInspector oi) { StringObjectInspector soi = (StringObjectInspector) oi; String s = soi.getPrimitiveJavaObject(o).trim(); try { - if (s.length() == DATE_LENGTH) { - result = Date.valueOf(s); - } else { - Timestamp ts = getTimestampFromString(s); - if (ts != null) { - result = Date.ofEpochMilli(ts.toEpochMilli()); - } + Date date = getDateFromString(s, formatter); + if (date != null) { + result = date; } } catch (IllegalArgumentException e) { // Do nothing @@ -1141,13 +1161,9 @@ public static Date getDate(Object o, PrimitiveObjectInspector oi) { case VARCHAR: { String val = getString(o, oi).trim(); try { - if (val.length() == DATE_LENGTH) { - result = Date.valueOf(val); - } else { - Timestamp ts = getTimestampFromString(val); - if (ts != null) { - result = Date.ofEpochMilli(ts.toEpochMilli()); - } + Date date = getDateFromString(val, formatter); + if (date != null) { + result = date; } } catch (IllegalArgumentException e) { // Do nothing @@ -1177,11 +1193,46 @@ public static Date getDate(Object o, PrimitiveObjectInspector oi) { return result; } + private final static int DATE_LENGTH = "YYYY-MM-DD".length(); + private static Date getDateFromString(String s, HiveDateTimeFormatter formatter) { + + // with SQL formats + if (formatter != null) { + try { + return Date.valueOf(s, formatter); + } catch (IllegalArgumentException e) { + return null; + } + } + + // without SQL formats + if (s.length() == DATE_LENGTH) { + return Date.valueOf(s); + } else { + Timestamp ts = getTimestampFromString(s); + if (ts != null) { + return Date.ofEpochMilli(ts.toEpochMilli()); + } + } + return null; + } + public static Timestamp getTimestamp(Object o, PrimitiveObjectInspector oi) { return getTimestamp(o, oi, false); } + public static Timestamp getTimestamp(Object o, PrimitiveObjectInspector oi, HiveDateTimeFormatter formatter) { + return getTimestamp(o, 
oi, false, formatter); + } + public static Timestamp getTimestamp(Object o, PrimitiveObjectInspector inputOI, boolean intToTimestampInSeconds) { + return getTimestamp(o, inputOI, intToTimestampInSeconds, null); + } + + public static Timestamp getTimestamp(Object o, + PrimitiveObjectInspector inputOI, + boolean intToTimestampInSeconds, + HiveDateTimeFormatter format) { if (o == null) { return null; } @@ -1225,11 +1276,11 @@ public static Timestamp getTimestamp(Object o, PrimitiveObjectInspector inputOI, case STRING: StringObjectInspector soi = (StringObjectInspector) inputOI; String s = soi.getPrimitiveJavaObject(o); - result = getTimestampFromString(s); + result = getTimestampFromString(s, format); break; case CHAR: case VARCHAR: - result = getTimestampFromString(getString(o, inputOI)); + result = getTimestampFromString(getString(o, inputOI), format); break; case DATE: result = Timestamp.ofEpochMilli( @@ -1254,15 +1305,17 @@ public static Timestamp getTimestamp(Object o, PrimitiveObjectInspector inputOI, return result; } - private final static int TS_LENGTH = "yyyy-mm-dd hh:mm:ss".length(); - private final static int DATE_LENGTH = "YYYY-MM-DD".length(); - public static Timestamp getTimestampFromString(String s) { + return getTimestampFromString(s, null); + } + + public static Timestamp getTimestampFromString(String s, HiveDateTimeFormatter formatter) { + s = s.trim(); s = trimNanoTimestamp(s); try { - return TimestampUtils.stringToTimestamp(s); + return TimestampUtils.stringToTimestamp(s, formatter); } catch (IllegalArgumentException e) { return null; } @@ -1284,19 +1337,6 @@ private static String trimNanoTimestamp(String s) { return s; } - private static boolean isValidTimeStamp(final String s) { - if (s.length() == TS_LENGTH || - (s.contains(".") && - s.substring(0, s.indexOf('.')).length() == TS_LENGTH)) { - // Possible timestamp - if (s.charAt(DATE_LENGTH) == '-') { - return false; - } - return true; - } - return false; - } - public static TimestampTZ getTimestampLocalTZ(Object o, PrimitiveObjectInspector oi, ZoneId timeZone) { if (o == null) {