diff --git common/src/java/org/apache/hadoop/hive/common/format/datetime/HiveSqlDateTimeFormatter.java common/src/java/org/apache/hadoop/hive/common/format/datetime/HiveSqlDateTimeFormatter.java new file mode 100644 index 0000000000..4e024a357b --- /dev/null +++ common/src/java/org/apache/hadoop/hive/common/format/datetime/HiveSqlDateTimeFormatter.java @@ -0,0 +1,885 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.common.format.datetime; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import org.apache.commons.lang.StringUtils; +import org.apache.hadoop.hive.common.type.Date; +import org.apache.hadoop.hive.common.type.Timestamp; + +import java.time.DateTimeException; +import java.time.Duration; +import java.time.Instant; +import java.time.LocalDateTime; +import java.time.ZoneOffset; +import java.time.temporal.ChronoField; +import java.time.temporal.ChronoUnit; +import java.time.temporal.TemporalField; +import java.time.temporal.TemporalUnit; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.TimeZone; + +/** + * Formatter using SQL:2016 datetime patterns. + * + * For all tokens: + * - Patterns are case-insensitive, except AM/PM and T/Z. See these sections for more details. + * - For string to datetime conversion, no duplicate format tokens are allowed, including tokens + * that have the same meaning but different lengths ("Y" and "YY" conflict) or different + * behaviors ("RR" and "YY" conflict). + * + * For all numeric tokens: + * - The "expected length" of input/output is the number of tokens in the character (e.g. "YYY": 3, + * "Y": 1, and so on), with some exceptions (see map SPECIAL_LENGTHS). + * - For string to datetime conversion, inputs of fewer digits than expected are accepted if + * followed by a delimiter, e.g. format="YYYY-MM-DD", input="19-1-1", output=2019-01-01 00:00:00. + * - For datetime to string conversion, output is left padded with zeros, e.g. format="DD SSSSS", + * input=2019-01-01 00:00:03, output="01 00003". + * + * + * Accepted format tokens: + * Note: "|" means "or". "Delimiter" means a separator, tokens T or Z, or end of input. + * + * A. 
Temporal tokens + * YYYY + * 4-digit year + * - For string to datetime conversion, prefix digits for 1, 2, and 3-digit inputs are obtained + * from current date + * E.g. input=‘9-01-01’, pattern =‘YYYY-MM-DD’, current year=2020, output=2029-01-01 00:00:00 + * + * + * YYY + * Last 3 digits of a year + * - Gets the prefix digit from current date. + * - Can accept fewer digits than 3, similarly to YYYY. + * + * YY + * Last 2 digits of a year + * - Gets the 2 prefix digits from current date. + * - Can accept fewer digits than 2, similarly to YYYY. + * + * Y + * Last digit of a year + * - Gets the 3 prefix digits from current date. + * + * RRRR + * 4-digit rounded year + * - String to datetime conversion: + * - If 2 digits are provided then acts like RR. + * - If 1,3 or 4 digits provided then acts like YYYY. + * - For datetime to string conversion, acts like YYYY. + * + * RR + * 2-digit rounded year + * -String to datetime conversion: + * - Semantics: + * Input: Last 2 digits of current year: First 2 digits of output: + * 0 to 49 00 to 49 First 2 digits of current year + * 0 to 49 50 to 99 First 2 digits of current year + 1 + * 50 to 99 00 to 49 First 2 digits of current year - 1 + * 50 to 99 50 to 99 First 2 digits of current year + * - If 1-digit year is provided followed by a delimiter, falls back to YYYY with 1-digit year + * input. + * - For datetime to string conversion, acts like YY. + * + * MM + * Month (1-12) + * - For string to datetime conversion, conflicts with DDD. + * + * DD + * Day of month (1-31) + * - For string to datetime conversion, conflicts with DDD. + * + * DDD + * Day of year (1-366) + * - For string to datetime conversion, conflicts with DD and MM. + * + * HH + * Hour of day (1-12) + * - If no AM/PM provided then defaults to AM. + * - In string to datetime conversion, conflicts with SSSSS and HH24. + * + * HH12 + * Hour of day (1-12) + * See HH. 
+ * + * HH24 + * Hour of day (0-23) + * - In string to datetime conversion, conflicts with SSSSS, HH12 and AM/PM. + * + * MI + * Minute of hour (0-59) + * - In string to datetime conversion, conflicts with SSSSS. + * + * SS + * Second of minute (0-59) + * - In string to datetime conversion, conflicts with SSSSS. + * + * SSSSS + * Second of Day (0-86399) + * - In string to datetime conversion, conflicts with SS, HH, HH12, HH24, MI, AM/PM. + * + * FF[1..9] + * Fraction of second + * - 1..9 indicates the number of decimal digits. "FF" (no number of digits specified) is also + * accepted. + * - In datetime to string conversion, "FF" will omit trailing zeros, or output "0" if subsecond + * value is 0. + * - In string to datetime conversion, fewer digits than expected are accepted if followed by a + * delimiter. "FF" acts like "FF9". + * + * AM|A.M. + * Meridiem indicator or AM/PM + * - Datetime to string conversion: + * - AM and PM mean the exact same thing in the pattern. + * e.g. input=2019-01-01 20:00, format=“AM”, output=“PM”. + * - Retains the exact format (capitalization and length) provided in the pattern string. If p.m. + * is in the pattern, we expect a.m. or p.m. in the output; if AM is in the pattern, we expect + * AM or PM in the output. + * - String to datetime conversion: + * - Conflicts with HH24 and SSSSS. + * - It doesn’t matter which meridian indicator is in the pattern. + * E.g. input="2019-01-01 11:00 p.m.", pattern="YYYY-MM-DD HH12:MI AM", + * output=2019-01-01 23:00:00 + * + * PM|P.M. + * Meridiem indicator + * See AM|A.M. + * + * B. Time zone tokens + * TZH + * Time zone offset hour (-15 to +15) + * - 3-character-long input is expected: 1 character for the sign and 2 digits for the value. + * e.g. “+10”, “-05” + * - 2-digit input is accepted without the sign, e.g. “04”. + * - Both these 2 and 3-digit versions are accepted even if not followed by separators. 
+ * - Disabled for timestamp to string and date to string conversion, as timestamp and date are time + * zone agnostic. + * + * TZM + * Time zone offset minute (0-59) + * - For string to datetime conversion: + * - TZH token is required. + * - Unsigned; sign comes from TZH. + * - Therefore time zone offsets like “-30” minutes should be expressed thus: input=“-00:30” + * pattern=“TZH:TZM”. + * - Disabled for timestamp to string and date to string conversion, as timestamp and date are time + * zone agnostic. + * + * C. Separators + * -|.|/|,|'|;|:| + * Separator + * - Uses loose matching. Existence of a sequence of separators in the format should match the + * existence of a sequence of separators in the input regardless of the types of the separator or + * the length of the sequence where length > 1. E.g. input=“2019-. ;10/10”, pattern=“YYYY-MM-DD” + * is valid; input=“20191010”, pattern=“YYYY-MM-DD” is not valid. + * - If the last separator character in the separator substring is "-" and is immediately followed + * by a time zone hour (tzh) token, it's a negative sign and not counted as a separator, UNLESS + * this is the only possible separator character in the separator substring (in which case it is + * not counted as the tzh's negative sign). + * + * D. ISO 8601 delimiters + * T + * ISO 8601 delimiter + * - Serves as a delimiter. + * - Function is to support formats like “YYYY-MM-DDTHH24:MI:SS.FF9Z”, “YYYY-MM-DD-HH24:MI:SSZ” + * - For datetime to string conversion, output is always capitalized ("T"), even if lowercase ("t") + * is provided in the pattern. + * + * Z + * ISO 8601 delimiter + * See T. 
+ */ + +public class HiveSqlDateTimeFormatter { + + private static final int LONGEST_TOKEN_LENGTH = 5; + private static final int LONGEST_ACCEPTED_PATTERN = 100; // for sanity's sake + private static final long MINUTES_PER_HOUR = 60; + private static final int NANOS_MAX_LENGTH = 9; + public static final int AM = 0; + public static final int PM = 1; + private String pattern; + private List tokens = new ArrayList<>(); + + private static final Map TEMPORAL_TOKENS = + ImmutableMap.builder() + .put("yyyy", ChronoField.YEAR).put("yyy", ChronoField.YEAR) + .put("yy", ChronoField.YEAR).put("y", ChronoField.YEAR) + .put("rrrr", ChronoField.YEAR).put("rr", ChronoField.YEAR) + .put("mm", ChronoField.MONTH_OF_YEAR) + .put("dd", ChronoField.DAY_OF_MONTH) + .put("ddd", ChronoField.DAY_OF_YEAR) + .put("hh", ChronoField.HOUR_OF_AMPM) + .put("hh12", ChronoField.HOUR_OF_AMPM) + .put("hh24", ChronoField.HOUR_OF_DAY) + .put("mi", ChronoField.MINUTE_OF_HOUR) + .put("ss", ChronoField.SECOND_OF_MINUTE) + .put("sssss", ChronoField.SECOND_OF_DAY) + .put("ff1", ChronoField.NANO_OF_SECOND).put("ff2", ChronoField.NANO_OF_SECOND) + .put("ff3", ChronoField.NANO_OF_SECOND).put("ff4", ChronoField.NANO_OF_SECOND) + .put("ff5", ChronoField.NANO_OF_SECOND).put("ff6", ChronoField.NANO_OF_SECOND) + .put("ff7", ChronoField.NANO_OF_SECOND).put("ff8", ChronoField.NANO_OF_SECOND) + .put("ff9", ChronoField.NANO_OF_SECOND).put("ff", ChronoField.NANO_OF_SECOND) + .put("a.m.", ChronoField.AMPM_OF_DAY).put("am", ChronoField.AMPM_OF_DAY) + .put("p.m.", ChronoField.AMPM_OF_DAY).put("pm", ChronoField.AMPM_OF_DAY) + .build(); + + private static final Map TIME_ZONE_TOKENS = + ImmutableMap.builder() + .put("tzh", ChronoUnit.HOURS).put("tzm", ChronoUnit.MINUTES).build(); + + private static final List VALID_ISO_8601_DELIMITERS = + ImmutableList.of("t", "z"); + + private static final List VALID_SEPARATORS = + ImmutableList.of("-", ":", " ", ".", "/", ";", "\'", ","); + + private static final Map SPECIAL_LENGTHS = 
ImmutableMap.builder() + .put("hh12", 2).put("hh24", 2).put("tzm", 2).put("am", 4).put("pm", 4) + .put("ff1", 1).put("ff2", 2).put("ff3", 3).put("ff4", 4).put("ff5", 5) + .put("ff6", 6).put("ff7", 7).put("ff8", 8).put("ff9", 9).put("ff", 9) + .build(); + + /** + * Represents broad categories of tokens. + */ + public enum TokenType { + TEMPORAL, + SEPARATOR, + TIMEZONE, + ISO_8601_DELIMITER + } + + /** + * Token representation. + */ + public static class Token { + TokenType type; + TemporalField temporalField; // for type TEMPORAL e.g. ChronoField.YEAR + TemporalUnit temporalUnit; // for type TIMEZONE e.g. ChronoUnit.HOURS + String string; // pattern string, e.g. "yyy" + int length; // length (e.g. YYY: 3, FF8: 8) + + public Token(TemporalField temporalField, String string, int length) { + this(TokenType.TEMPORAL, temporalField, null, string, length); + } + + public Token(TemporalUnit temporalUnit, String string, int length) { + this(TokenType.TIMEZONE, null, temporalUnit, string, length); + } + + public Token(TokenType tokenType, String string) { + this(tokenType, null, null, string, string.length()); + } + + public Token(TokenType tokenType, TemporalField temporalField, TemporalUnit temporalUnit, + String string, int length) { + this.type = tokenType; + this.temporalField = temporalField; + this.temporalUnit = temporalUnit; + this.string = string; + this.length = length; + } + + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append(string); + sb.append(" type: "); + sb.append(type); + if (temporalField != null) { + sb.append(" temporalField: "); + sb.append(temporalField); + } else if (temporalUnit != null) { + sb.append(" temporalUnit: "); + sb.append(temporalUnit); + } + return sb.toString(); + } + } + + public HiveSqlDateTimeFormatter(String pattern, boolean forParsing) { + setPattern(pattern, forParsing); + } + + /** + * Parse and perhaps verify the pattern. 
+ */ + private void setPattern(String pattern, boolean forParsing) { + assert pattern.length() < LONGEST_ACCEPTED_PATTERN : "The input format is too long"; + this.pattern = pattern; + + parsePatternToTokens(pattern); + + // throw IllegalArgumentException if pattern is invalid + if (forParsing) { + verifyForParse(); + } else { + verifyForFormat(); + } + } + + /** + * Parse pattern to list of tokens. + */ + private String parsePatternToTokens(String pattern) { + tokens.clear(); + String originalPattern = pattern; + pattern = pattern.toLowerCase(); + + // indexes of the substring we will check (includes begin, does not include end) + int begin=0, end=0; + String candidate; + Token lastAddedToken = null; + + while (begin < pattern.length()) { + // if begin hasn't progressed, then pattern is not parsable + if (begin != end) { + tokens.clear(); + throw new IllegalArgumentException("Bad date/time conversion pattern: " + pattern); + } + + // find next token + for (int i = LONGEST_TOKEN_LENGTH; i > 0; i--) { + end = begin + i; + if (end > pattern.length()) { // don't go past the end of the pattern string + continue; + } + candidate = pattern.substring(begin, end); + if (isSeparator(candidate)) { + lastAddedToken = parseSeparatorToken(candidate, lastAddedToken); + begin = end; + break; + } + if (isIso8601Delimiter(candidate)) { + lastAddedToken = parseIso8601DelimiterToken(candidate); + begin = end; + break; + } + if (isTemporalToken(candidate)) { + lastAddedToken = parseTemporalToken(originalPattern, begin, candidate); + begin = end; + break; + } + if (isTimeZoneToken(candidate)) { + lastAddedToken = parseTimeZoneToken(candidate); + begin = end; + break; + } + } + } + return pattern; + } + + private boolean isSeparator(String candidate) { + return candidate.length() == 1 && VALID_SEPARATORS.contains(candidate); + } + + private boolean isIso8601Delimiter(String candidate) { + return candidate.length() == 1 && VALID_ISO_8601_DELIMITERS.contains(candidate); + } + + private 
boolean isTemporalToken(String candidate) { + return TEMPORAL_TOKENS.containsKey(candidate); + } + + private boolean isTimeZoneToken(String pattern) { + return TIME_ZONE_TOKENS.containsKey(pattern); + } + + private Token parseSeparatorToken(String candidate, Token lastAddedToken) { + // try to clump separator with immediately preceding separators (e.g. "---" counts as one + // separator) + if (lastAddedToken != null && lastAddedToken.type == TokenType.SEPARATOR) { + lastAddedToken.string += candidate; + lastAddedToken.length += 1; + } else { + lastAddedToken = new Token(TokenType.SEPARATOR, candidate); + tokens.add(lastAddedToken); + } + return lastAddedToken; + } + + private Token parseIso8601DelimiterToken(String candidate) { + Token lastAddedToken; + lastAddedToken = new Token(TokenType.ISO_8601_DELIMITER, candidate.toUpperCase()); + tokens.add(lastAddedToken); + return lastAddedToken; + } + + private Token parseTemporalToken(String originalPattern, int begin, String candidate) { + Token lastAddedToken; + + // for AM/PM, keep original case + if (TEMPORAL_TOKENS.get(candidate) == ChronoField.AMPM_OF_DAY) { + int subStringEnd = begin + candidate.length(); + candidate = originalPattern.substring(begin, subStringEnd); + } + lastAddedToken = new Token(TEMPORAL_TOKENS.get(candidate.toLowerCase()), candidate, + getTokenStringLength(candidate.toLowerCase())); + tokens.add(lastAddedToken); + return lastAddedToken; + } + + private Token parseTimeZoneToken(String candidate) { + Token lastAddedToken; + lastAddedToken = new Token(TIME_ZONE_TOKENS.get(candidate), candidate, + getTokenStringLength(candidate)); + tokens.add(lastAddedToken); + return lastAddedToken; + } + + private int getTokenStringLength(String candidate) { + Integer length = SPECIAL_LENGTHS.get(candidate); + if (length != null) { + return length; + } + return candidate.length(); + } + + /** + * Make sure the generated list of tokens is valid for parsing strings to datetime objects. 
+ */ + private void verifyForParse() { + + // create a list of tokens' temporal fields + ArrayList temporalFields = new ArrayList<>(); + ArrayList timeZoneTemporalUnits = new ArrayList<>(); + int roundYearCount=0, yearCount=0; + for (Token token : tokens) { + if (token.temporalField != null) { + temporalFields.add(token.temporalField); + if (token.temporalField == ChronoField.YEAR) { + if (token.string.startsWith("r")) { + roundYearCount += 1; + } else { + yearCount += 1; + } + } + } else if (token.temporalUnit != null) { + timeZoneTemporalUnits.add(token.temporalUnit); + } + } + if (!(temporalFields.contains(ChronoField.YEAR))) { + throw new IllegalArgumentException("Missing year token."); + } + if (!(temporalFields.contains(ChronoField.MONTH_OF_YEAR) && + temporalFields.contains(ChronoField.DAY_OF_MONTH) || + temporalFields.contains(ChronoField.DAY_OF_YEAR))) { + throw new IllegalArgumentException("Missing day of year or (month of year + day of month)" + + " tokens."); + } + if (roundYearCount > 0 && yearCount > 0) { + throw new IllegalArgumentException("Invalid duplication of format element: Both year and" + + "round year are provided"); + } + for (TemporalField tokenType : temporalFields) { + if (Collections.frequency(temporalFields, tokenType) > 1) { + throw new IllegalArgumentException( + "Invalid duplication of format element: multiple " + tokenType.toString() + + " tokens provided."); + } + } + if (temporalFields.contains(ChronoField.AMPM_OF_DAY) && + !(temporalFields.contains(ChronoField.HOUR_OF_DAY) || + temporalFields.contains(ChronoField.HOUR_OF_AMPM))) { + throw new IllegalArgumentException("AM/PM provided but missing hour token."); + } + if (temporalFields.contains(ChronoField.AMPM_OF_DAY) && + temporalFields.contains(ChronoField.HOUR_OF_DAY)) { + throw new IllegalArgumentException("Conflict between median indicator and hour token."); + } + if (temporalFields.contains(ChronoField.HOUR_OF_AMPM) && + temporalFields.contains(ChronoField.HOUR_OF_DAY)) { + 
throw new IllegalArgumentException("Conflict between hour of day and hour of am/pm token."); + } + if (temporalFields.contains(ChronoField.DAY_OF_YEAR) && + (temporalFields.contains(ChronoField.DAY_OF_MONTH) || + temporalFields.contains(ChronoField.MONTH_OF_YEAR))) { + throw new IllegalArgumentException("Day of year provided with day or month token."); + } + if (temporalFields.contains(ChronoField.SECOND_OF_DAY) && + (temporalFields.contains(ChronoField.HOUR_OF_DAY) || + temporalFields.contains(ChronoField.HOUR_OF_AMPM) || + temporalFields.contains(ChronoField.MINUTE_OF_HOUR) || + temporalFields.contains(ChronoField.SECOND_OF_MINUTE))) { + throw new IllegalArgumentException( + "Second of day token conflicts with other token(s)."); + } + if (timeZoneTemporalUnits.contains(ChronoUnit.MINUTES) && + !timeZoneTemporalUnits.contains(ChronoUnit.HOURS)) { + throw new IllegalArgumentException("Time zone minute token provided without time zone hour token."); + } + } + + /** + * Make sure the generated list of tokens is valid for formatting datetime objects to strings. 
+ */ + private void verifyForFormat() { + for (Token token : tokens) { + if (token.type == TokenType.TIMEZONE) { + throw new IllegalArgumentException(token.string.toUpperCase() + " not a valid format for " + + "timestamp or date."); + } + } + } + + public String format(Timestamp ts) { + StringBuilder fullOutputSb = new StringBuilder(); + String outputString = null; + int value; + LocalDateTime localDateTime = + LocalDateTime.ofEpochSecond(ts.toEpochSecond(), ts.getNanos(), ZoneOffset.UTC); + for (Token token : tokens) { + switch (token.type) { + case TEMPORAL: + try { + value = localDateTime.get(token.temporalField); + outputString = formatTemporal(value, token); + } catch (DateTimeException e) { + throw new IllegalArgumentException(token.temporalField + " couldn't be obtained from " + + "LocalDateTime " + localDateTime, e); + } + break; + case TIMEZONE: //invalid for timestamp and date + throw new IllegalArgumentException(token.string.toUpperCase() + " not a valid format for " + + "timestamp or date."); + case SEPARATOR: + outputString = token.string; + break; + case ISO_8601_DELIMITER: + outputString = token.string.toUpperCase(); + break; + default: + // won't happen + } + fullOutputSb.append(outputString); + } + return fullOutputSb.toString(); + } + + public String format(Date date) { + return format(Timestamp.ofEpochSecond(date.toEpochSecond())); + } + + private String formatTemporal(int value, Token token) { + String output; + if (token.temporalField == ChronoField.AMPM_OF_DAY) { + output = value == 0 ? "a" : "p"; + output += token.string.length() == 2 ? 
"m" : ".m."; + if (token.string.startsWith("A") || token.string.startsWith("P")) { + output = output.toUpperCase(); + } + } else { // it's a numeric value + + if (token.temporalField == ChronoField.HOUR_OF_AMPM && value == 0) { + value = 12; + } + try { + output = String.valueOf(value); + output = padOrTruncateNumericTemporal(token, output); + } catch (Exception e) { + throw new IllegalArgumentException("Value: " + value + " couldn't be cast to string.", e); + } + } + return output; + } + + /** + * To match token.length, pad left with zeroes or truncate. + */ + private String padOrTruncateNumericTemporal(Token token, String output) { + if (output.length() < token.length) { + output = StringUtils.leftPad(output, token.length, '0'); // pad left + } else if (output.length() > token.length) { + if (token.temporalField == ChronoField.NANO_OF_SECOND) { + output = output.substring(0, token.length); // truncate right + } else { + output = output.substring(output.length() - token.length); // truncate left + } + } + if (token.temporalField == ChronoField.NANO_OF_SECOND + && token.string.equalsIgnoreCase("ff")) { + output = output.replaceAll("0*$", ""); //truncate trailing 0's + if (output.isEmpty()) { + output = "0"; + } + } + return output; + } + + /** + * Left here for timestamp with local time zone. + */ + private String formatTimeZone(TimeZone timeZone, LocalDateTime localDateTime, Token token) { + ZoneOffset offset = timeZone.toZoneId().getRules().getOffset(localDateTime); + Duration seconds = Duration.of(offset.get(ChronoField.OFFSET_SECONDS), ChronoUnit.SECONDS); + if (token.string.equals("tzh")) { + long hours = seconds.toHours(); + String s = (hours >= 0) ? "+" : "-"; + s += (Math.abs(hours) < 10) ? 
"0" : ""; + s += String.valueOf(Math.abs(hours)); + return s; + } else { + long minutes = Math.abs(seconds.toMinutes() % MINUTES_PER_HOUR); + String s = String.valueOf(minutes); + if (s.length() == 1) { + s = "0" + s; + } + return s; + } + } + + public Timestamp parseTimestamp(String fullInput){ + LocalDateTime ldt = LocalDateTime.ofInstant(Instant.EPOCH, ZoneOffset.UTC); + String substring; + int index = 0; + int value; + int timeZoneSign = 0, timeZoneHours = 0, timeZoneMinutes = 0; + + for (Token token : tokens) { + switch (token.type) { + case TEMPORAL: + substring = getNextSubstring(fullInput, index, token); // e.g. yy-m -> yy + value = parseTemporal(substring, token); // e.g. 18->2018, July->07 + try { + ldt = ldt.with(token.temporalField, value); + } catch (DateTimeException e){ + throw new IllegalArgumentException( + "Value " + value + " not valid for token " + token.toString()); + } + index += substring.length(); + break; + case TIMEZONE: + if (token.temporalUnit == ChronoUnit.HOURS) { + String nextCharacter = fullInput.substring(index, index + 1); + timeZoneSign = "-".equals(nextCharacter) ? -1 : 1; + if ("-".equals(nextCharacter) || "+".equals(nextCharacter)) { + index++; + } + // parse next two digits + substring = getNextSubstring(fullInput, index, index + 2, token); + try { + timeZoneHours = Integer.parseInt(substring); + } catch (NumberFormatException e) { + throw new IllegalArgumentException("Couldn't parse substring \"" + substring + + "\" with token " + token + " to int. Pattern is " + pattern, e); + } + if (timeZoneHours < -15 || timeZoneHours > 15) { + throw new IllegalArgumentException("Couldn't parse substring \"" + substring + + "\" to TZH because TZH range is -15 to +15. 
Pattern is " + pattern); + } + } else { // time zone minutes + substring = getNextSubstring(fullInput, index, token); + try { + timeZoneMinutes = Integer.parseInt(substring); + } catch (NumberFormatException e) { + throw new IllegalArgumentException("Couldn't parse substring \"" + substring + + "\" with token " + token + " to int. Pattern is " + pattern, e); + } + if (timeZoneMinutes < 0 || timeZoneMinutes > 59) { + throw new IllegalArgumentException("Couldn't parse substring \"" + substring + + "\" to TZM because TZM range is 0 to 59. Pattern is " + pattern); + } + } + index += substring.length(); + break; + case SEPARATOR: + index = parseSeparator(fullInput, index, token); + break; + case ISO_8601_DELIMITER: + index = parseIso8601Delimiter(fullInput, index, token); + default: + //do nothing + } + } + + // anything left unparsed at end of string? throw error + if (!fullInput.substring(index).isEmpty()) { + throw new IllegalArgumentException("Leftover input after parsing: " + + fullInput.substring(index) + " in string " + fullInput); + } + + return Timestamp.ofEpochSecond(ldt.toEpochSecond(ZoneOffset.UTC), ldt.getNano()); + } + + public Date parseDate(String input){ + return Date.ofEpochMilli(parseTimestamp(input).toEpochMilli()); + } + /** + * Return the next substring to parse. Length is either specified or token.length, but a + * separator or an ISO-8601 delimiter can cut the substring short. (e.g. if the token pattern is + * "YYYY" we expect the next 4 characters to be 4 numbers. However, if it is "976/" then we + * return "976" because a separator cuts it short.) 
+ */ + private String getNextSubstring(String s, int begin, Token token) { + return getNextSubstring(s, begin, begin + token.length, token); + } + + private String getNextSubstring(String s, int begin, int end, Token token) { + if (end > s.length()) { + end = s.length(); + } + s = s.substring(begin, end); + if (token.temporalField == ChronoField.AMPM_OF_DAY) { + if (s.charAt(1) == 'm' || s.charAt(1) == 'M') { // length 2 + return s.substring(0, 2); + } else { + return s; + } + } + for (String sep : VALID_SEPARATORS) { + if (s.contains(sep)) { + s = s.substring(0, s.indexOf(sep)); + } + } + // TODO this will cause problems with DAY (for example, Thursday starts with T) + for (String delimiter : VALID_ISO_8601_DELIMITERS) { + if (s.toLowerCase().contains(delimiter)) { + s = s.substring(0, s.toLowerCase().indexOf(delimiter)); + } + } + + return s; + } + + /** + * Get the integer value of a temporal substring. + */ + private int parseTemporal(String substring, Token token){ + // exceptions to the rule + if (token.temporalField == ChronoField.AMPM_OF_DAY) { + return substring.toLowerCase().startsWith("a") ? 
AM : PM; + + } else if (token.temporalField == ChronoField.HOUR_OF_AMPM && "12".equals(substring)) { + substring = "0"; + + } else if (token.temporalField == ChronoField.YEAR) { + String currentYearString = String.valueOf(LocalDateTime.now().getYear()); + //deal with round years + if (token.string.startsWith("r") && substring.length() == 2) { + int currFirst2Digits = Integer.parseInt(currentYearString.substring(0, 2)); + int currLast2Digits = Integer.parseInt(currentYearString.substring(2)); + int valLast2Digits = Integer.parseInt(substring); + if (valLast2Digits < 50 && currLast2Digits >= 50) { + currFirst2Digits += 1; + } else if (valLast2Digits >= 50 && currLast2Digits < 50) { + currFirst2Digits -= 1; + } + substring = String.valueOf(currFirst2Digits) + substring; + } else { // fill in prefix digits with current date + substring = currentYearString.substring(0, 4 - substring.length()) + substring; + } + + } else if (token.temporalField == ChronoField.NANO_OF_SECOND) { + int i = Integer.min(token.length, substring.length()); + substring += StringUtils.repeat("0", NANOS_MAX_LENGTH - i); + } + + // the rule + try { + return Integer.parseInt(substring); + } catch (NumberFormatException e) { + throw new IllegalArgumentException("Couldn't parse substring \"" + substring + + "\" with token " + token + " to integer. Pattern is " + pattern, e); + } + } + + /** + * Parse the next separator(s). At least one separator character is expected. Separator + * characters are interchangeable. + * + * Caveat: If the last separator character in the separator substring is "-" and is immediately + * followed by a time zone hour (tzh) token, it's a negative sign and not counted as a + * separator, UNLESS this is the only separator character in the separator substring (in + * which case it is not counted as the negative sign). 
+ * + * @throws IllegalArgumentException if separator is missing + */ + private int parseSeparator(String fullInput, int index, Token token){ + int separatorsFound = 0; + int begin = index; + + while (index < fullInput.length() && + VALID_SEPARATORS.contains(fullInput.substring(index, index + 1))) { + if (!isLastCharacterOfSeparator(index, fullInput) + || !("-".equals(fullInput.substring(index, index + 1)) && (nextTokenIs("tzh", token))) + || separatorsFound == 0) { + separatorsFound++; + } + index++; + } + + if (separatorsFound == 0) { + throw new IllegalArgumentException("Missing separator at index " + index); + } + return begin + separatorsFound; + } + + private int parseIso8601Delimiter(String fullInput, int index, Token token) { + String substring; + substring = fullInput.substring(index, index + 1); + if (token.string.equalsIgnoreCase(substring)) { + index++; + } else { + throw new IllegalArgumentException( + "Missing ISO 8601 delimiter " + token.string.toUpperCase()); + } + return index; + } + + /** + * Is the next character something other than a separator? + */ + private boolean isLastCharacterOfSeparator(int index, String string) { + if (index == string.length() - 1) { // if we're at the end of the string, yes + return true; + } + return !VALID_SEPARATORS.contains(string.substring(index + 1, index + 2)); + } + + /** + * Does the temporalUnit/temporalField of the next token match the pattern's? 
+ */ + private boolean nextTokenIs(String pattern, Token currentToken) { + // make sure currentToken isn't the last one + if (tokens.indexOf(currentToken) == tokens.size() - 1) { + return false; + } + Token nextToken = tokens.get(tokens.indexOf(currentToken) + 1); + pattern = pattern.toLowerCase(); + return (isTimeZoneToken(pattern) && TIME_ZONE_TOKENS.get(pattern) == nextToken.temporalUnit + || isTemporalToken(pattern) && TEMPORAL_TOKENS.get(pattern) == nextToken.temporalField); + } + + public String getPattern() { + return pattern; + } + + /** + * @return a copy of token list + */ + protected List getTokens() { + return new ArrayList<>(tokens); + } +} diff --git common/src/java/org/apache/hadoop/hive/common/format/datetime/package-info.java common/src/java/org/apache/hadoop/hive/common/format/datetime/package-info.java new file mode 100644 index 0000000000..1e838be886 --- /dev/null +++ common/src/java/org/apache/hadoop/hive/common/format/datetime/package-info.java @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Deals with formatting and parsing of datetime objects. 
+ */ +package org.apache.hadoop.hive.common.format.datetime; diff --git common/src/test/org/apache/hadoop/hive/common/format/datetime/TestHiveSqlDateTimeFormatter.java common/src/test/org/apache/hadoop/hive/common/format/datetime/TestHiveSqlDateTimeFormatter.java new file mode 100644 index 0000000000..4e822d53f9 --- /dev/null +++ common/src/test/org/apache/hadoop/hive/common/format/datetime/TestHiveSqlDateTimeFormatter.java @@ -0,0 +1,330 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.common.format.datetime; + +import junit.framework.TestCase; +import org.apache.hadoop.hive.common.type.Date; +import org.apache.hadoop.hive.common.type.Timestamp; + +import java.time.LocalDate; +import java.time.LocalDateTime; +import java.time.ZoneOffset; +import java.time.format.DateTimeFormatter; +import java.time.format.DateTimeFormatterBuilder; +import java.time.format.ResolverStyle; +import java.time.format.SignStyle; +import java.time.temporal.ChronoField; +import java.time.temporal.TemporalField; +import java.util.ArrayList; +import java.util.Arrays; + +import static java.time.temporal.ChronoField.DAY_OF_MONTH; +import static java.time.temporal.ChronoField.HOUR_OF_DAY; +import static java.time.temporal.ChronoField.MINUTE_OF_HOUR; +import static java.time.temporal.ChronoField.MONTH_OF_YEAR; +import static java.time.temporal.ChronoField.SECOND_OF_MINUTE; +import static java.time.temporal.ChronoField.YEAR; + +/** + * Tests HiveSqlDateTimeFormatter. + */ + +public class TestHiveSqlDateTimeFormatter extends TestCase { + + private HiveSqlDateTimeFormatter formatter; + + public void testSetPattern() { + verifyPatternParsing(" ---yyyy-\'-:- -,.;/MM-dd--", new ArrayList<>(Arrays.asList( + null, // represents separator, which has no temporal field + ChronoField.YEAR, + null, + ChronoField.MONTH_OF_YEAR, + null, + ChronoField.DAY_OF_MONTH, + null + ))); + + verifyPatternParsing("ymmdddhh24::mi:ss A.M. pm", 25, "ymmdddhh24::mi:ss A.M. 
pm", + new ArrayList<>(Arrays.asList( + ChronoField.YEAR, + ChronoField.MONTH_OF_YEAR, + ChronoField.DAY_OF_YEAR, + ChronoField.HOUR_OF_DAY, + null, ChronoField.MINUTE_OF_HOUR, + null, ChronoField.SECOND_OF_MINUTE, + null, ChronoField.AMPM_OF_DAY, + null, ChronoField.AMPM_OF_DAY + ))); + } + + public void testSetPatternWithBadPatterns() { + verifyBadPattern("eyyyy-ddd", true); + verifyBadPattern("1yyyy-mm-dd", true); + + //duplicates + verifyBadPattern("yyyy Y", true); + verifyBadPattern("yyyy R", true); + + //missing year or (month + dayofmonth or dayofyear) + verifyBadPattern("yyyy", true); + verifyBadPattern("yyyy-mm", true); + verifyBadPattern("yyyy-dd", true); + verifyBadPattern("mm-dd", true); + verifyBadPattern("ddd", true); + + verifyBadPattern("yyyy-MM-DDD", true); + verifyBadPattern("yyyy-mm-DD DDD", true); + verifyBadPattern("yyyy-mm-dd HH24 HH12", true); + verifyBadPattern("yyyy-mm-dd HH24 AM", true); + verifyBadPattern("yyyy-mm-dd HH24 SSSSS", true); + verifyBadPattern("yyyy-mm-dd HH12 SSSSS", true); + verifyBadPattern("yyyy-mm-dd SSSSS AM", true); + verifyBadPattern("yyyy-mm-dd MI SSSSS", true); + verifyBadPattern("yyyy-mm-dd SS SSSSS", true); + + verifyBadPattern("tzm", false); + verifyBadPattern("tzh", false); + } + + public void testFormatTimestamp() { + checkFormatTs("rr rrrr ddd", "2018-01-03 00:00:00", "18 2018 003"); + checkFormatTs("yyyy-mm-ddtsssss.ff4z", "2018-02-03 00:00:10.777777777", "2018-02-03T00010.7777Z"); + checkFormatTs("hh24:mi:ss.ff1", "2018-02-03 01:02:03.999999999", "01:02:03.9"); + checkFormatTs("y yyy hh:mi:ss.ffz", "2018-02-03 01:02:03.0070070", "8 018 01:02:03.007007Z"); + checkFormatTs("am a.m. pm p.m. AM A.M. PM P.M.", "2018-02-03 01:02:03.0070070", "am a.m. am a.m. AM A.M. 
AM A.M."); + checkFormatTs("HH12 P.M.", "2019-01-01 00:15:10", "12 A.M."); + checkFormatTs("HH12 AM", "2019-01-01 12:15:10", "12 PM"); + checkFormatTs("YYYY-MM-DD HH12PM", "2017-05-05 00:00:00", "2017-05-05 12AM"); + } + + private void checkFormatTs(String pattern, String input, String expectedOutput) { + formatter = new HiveSqlDateTimeFormatter(pattern, false); + assertEquals(expectedOutput, formatter.format(toTimestamp(input))); + } + + public void testFormatDate() { + checkFormatDate("rr rrrr ddd", "2018-01-03", "18 2018 003"); + checkFormatDate("yyyy-mm-ddtsssss.ff4z", "2018-02-03", "2018-02-03T00000.0000Z"); + checkFormatDate("hh24:mi:ss.ff1", "2018-02-03", "00:00:00.0"); + checkFormatDate("y yyy T hh:mi:ss.ff am z", "2018-02-03", "8 018 T 12:00:00.0 am Z"); + checkFormatDate("am a.m. pm p.m. AM A.M. PM P.M.", "2018-02-03", "am a.m. am a.m. AM A.M. AM A.M."); + checkFormatDate("DDD", "2019-12-31", "365"); + checkFormatDate("DDD", "2020-12-31", "366"); + } + + private void checkFormatDate(String pattern, String input, String expectedOutput) { + formatter = new HiveSqlDateTimeFormatter(pattern, false); + assertEquals(expectedOutput, formatter.format(toDate(input))); + } + + public void testParseTimestamp() { + String thisYearString = String.valueOf(LocalDateTime.now().getYear()); + int firstTwoDigits = getFirstTwoDigits(); + + //y + checkParseTimestamp("y-mm-dd", "0-02-03", thisYearString.substring(0, 3) + "0-02-03 00:00:00"); + checkParseTimestamp("yy-mm-dd", "00-02-03", thisYearString.substring(0, 2) + "00-02-03 00:00:00"); + checkParseTimestamp("yyy-mm-dd", "000-02-03", thisYearString.substring(0, 1) + "000-02-03 00:00:00"); + checkParseTimestamp("yyyy-mm-dd", "000-02-03", thisYearString.substring(0, 1) + "000-02-03 00:00:00"); + checkParseTimestamp("rr-mm-dd", "0-02-03", thisYearString.substring(0, 3) + "0-02-03 00:00:00"); + checkParseTimestamp("rrrr-mm-dd", "000-02-03", thisYearString.substring(0, 1) + "000-02-03 00:00:00"); + + //rr, rrrr + 
checkParseTimestamp("rr-mm-dd", "00-02-03", firstTwoDigits + 1 + "00-02-03 00:00:00"); + checkParseTimestamp("rr-mm-dd", "49-02-03", firstTwoDigits + 1 + "49-02-03 00:00:00"); + checkParseTimestamp("rr-mm-dd", "50-02-03", firstTwoDigits + "50-02-03 00:00:00"); + checkParseTimestamp("rr-mm-dd", "99-02-03", firstTwoDigits + "99-02-03 00:00:00"); + checkParseTimestamp("rrrr-mm-dd", "00-02-03", firstTwoDigits + 1 + "00-02-03 00:00:00"); + checkParseTimestamp("rrrr-mm-dd", "49-02-03", firstTwoDigits + 1 + "49-02-03 00:00:00"); + checkParseTimestamp("rrrr-mm-dd", "50-02-03", firstTwoDigits + "50-02-03 00:00:00"); + checkParseTimestamp("rrrr-mm-dd", "99-02-03", firstTwoDigits + "99-02-03 00:00:00"); + + //everything else + checkParseTimestamp("yyyy-mm-ddThh24:mi:ss.ff8z", "2018-02-03T04:05:06.5665Z", "2018-02-03 04:05:06.5665"); + checkParseTimestamp("yyyy-mm-dd hh24:mi:ss.ff", "2018-02-03 04:05:06.555555555", "2018-02-03 04:05:06.555555555"); + checkParseTimestamp("yyyy-mm-dd hh12:mi:ss", "2099-2-03 04:05:06", "2099-02-03 04:05:06"); + checkParseTimestamp("yyyyddd", "2018284", "2018-10-11 00:00:00"); + checkParseTimestamp("yyyyddd", "20184", "2018-01-04 00:00:00"); + checkParseTimestamp("yyyy-mm-ddThh24:mi:ss.ffz", "2018-02-03t04:05:06.444Z", "2018-02-03 04:05:06.444"); + checkParseTimestamp("yyyy-mm-dd hh:mi:ss A.M.", "2018-02-03 04:05:06 P.M.", "2018-02-03 16:05:06"); + checkParseTimestamp("YYYY-MM-DD HH24:MI TZH:TZM", "2019-1-1 14:00--1:-30", "2019-01-01 14:00:00"); + checkParseTimestamp("YYYY-MM-DD HH24:MI TZH:TZM", "2019-1-1 14:00-1:30", "2019-01-01 14:00:00"); + checkParseTimestamp("yyyy-mm-dd TZM:TZH", "2019-01-01 1 -3", "2019-01-01 00:00:00"); + checkParseTimestamp("yyyy-mm-dd TZH:TZM", "2019-01-01 -0:30", "2019-01-01 00:00:00"); + checkParseTimestamp("TZM/YYY-MM-TZH/DD", "0/333-01-11/02", "2333-01-02 00:00:00"); + checkParseTimestamp("YYYY-MM-DD HH12:MI AM", "2019-01-01 11:00 p.m.", "2019-01-01 23:00:00"); + checkParseTimestamp("YYYY-MM-DD HH12:MI A.M..", 
"2019-01-01 11:00 pm.", "2019-01-01 23:00:00"); + checkParseTimestamp("MI DD-TZM-YYYY-MM TZHPM SS:HH12.FF9", + "59 03-30-2017-05 01PM 01:08.123456789", "2017-05-03 20:59:01.123456789"); + checkParseTimestamp("YYYYDDMMHH12MISSFFAMTZHTZM", + "20170501123159123456789AM-0130", "2017-01-05 00:31:59.123456789"); + checkParseTimestamp("YYYY-MM-DD AMHH12", "2017-05-06 P.M.12", "2017-05-06 12:00:00"); + checkParseTimestamp("YYYY-MM-DD HH12PM", "2017-05-05 12AM", "2017-05-05 00:00:00"); + checkParseTimestamp("YYYY-MM-DD HH12:MI:SS.FF9PM TZH:TZM", + "2017-05-03 08:59:01.123456789PM 01:30", "2017-05-03 20:59:01.123456789"); + checkParseTimestamp("YYYYDDMMHH12MISSFFAMTZHTZM", + "20170501120159123456789AM-0130", "2017-01-05 00:01:59.123456789"); + + //Test "day in year" token in a leap year scenario + checkParseTimestamp("YYYY DDD", "2000 60", "2000-02-29 00:00:00"); + checkParseTimestamp("YYYY DDD", "2000 61", "2000-03-01 00:00:00"); + checkParseTimestamp("YYYY DDD", "2000 366", "2000-12-31 00:00:00"); + //Test timezone offset parsing without separators + checkParseTimestamp("YYYYMMDDHH12MIA.M.TZHTZM", "201812310800AM+0515", "2018-12-31 08:00:00"); + checkParseTimestamp("YYYYMMDDHH12MIA.M.TZHTZM", "201812310800AM0515", "2018-12-31 08:00:00"); + checkParseTimestamp("YYYYMMDDHH12MIA.M.TZHTZM", "201812310800AM-0515", "2018-12-31 08:00:00"); + } + + private int getFirstTwoDigits() { + int thisYear = LocalDateTime.now().getYear(); + int firstTwoDigits = thisYear / 100; + if (thisYear % 100 < 50) { + firstTwoDigits -= 1; + } + return firstTwoDigits; + } + + private void checkParseTimestamp(String pattern, String input, String expectedOutput) { + formatter = new HiveSqlDateTimeFormatter(pattern, true); + assertEquals(toTimestamp(expectedOutput), formatter.parseTimestamp(input)); + } + + public void testParseDate() { + + String thisYearString = String.valueOf(LocalDateTime.now().getYear()); + int firstTwoDigits = getFirstTwoDigits(); + //y + checkParseDate("y-mm-dd", "0-02-03", 
thisYearString.substring(0, 3) + "0-02-03"); + checkParseDate("yy-mm-dd", "00-02-03", thisYearString.substring(0, 2) + "00-02-03"); + checkParseDate("yyy-mm-dd", "000-02-03", thisYearString.substring(0, 1) + "000-02-03"); + checkParseDate("yyyy-mm-dd", "000-02-03", thisYearString.substring(0, 1) + "000-02-03"); + checkParseDate("rr-mm-dd", "0-02-03", thisYearString.substring(0, 3) + "0-02-03"); + checkParseDate("rrrr-mm-dd", "000-02-03", thisYearString.substring(0, 1) + "000-02-03"); + + //rr, rrrr + checkParseDate("rr-mm-dd", "00-02-03", firstTwoDigits + 1 + "00-02-03"); + checkParseDate("rr-mm-dd", "49-02-03", firstTwoDigits + 1 + "49-02-03"); + checkParseDate("rr-mm-dd", "50-02-03", firstTwoDigits + "50-02-03"); + checkParseDate("rr-mm-dd", "99-02-03", firstTwoDigits + "99-02-03"); + checkParseDate("rrrr-mm-dd", "00-02-03", firstTwoDigits + 1 + "00-02-03"); + checkParseDate("rrrr-mm-dd", "49-02-03", firstTwoDigits + 1 + "49-02-03"); + checkParseDate("rrrr-mm-dd", "50-02-03", firstTwoDigits + "50-02-03"); + checkParseDate("rrrr-mm-dd", "99-02-03", firstTwoDigits + "99-02-03"); + + checkParseDate("yyyy-mm-dd hh mi ss.ff7", "2018/01/01 2.2.2.55", "2018-01-01"); + } + + private void checkParseDate(String pattern, String input, String expectedOutput) { + formatter = new HiveSqlDateTimeFormatter(pattern, true); + assertEquals(toDate(expectedOutput), formatter.parseDate(input)); + } + + public void testParseTimestampError() { + verifyBadParseString("yyyy", "2019-02-03"); + verifyBadParseString("yyyy-mm-dd ", "2019-02-03"); //separator missing + verifyBadParseString("yyyy-mm-dd", "2019-02-03..."); //extra separators + verifyBadParseString("yyyy-mm-dd hh12:mi:ss", "2019-02-03 14:00:00"); //hh12 out of range + verifyBadParseString("yyyy-dddsssss", "2019-912345"); + verifyBadParseString("yyyy-mm-dd", "2019-13-23"); //mm out of range + verifyBadParseString("yyyy-mm-dd tzh:tzm", "2019-01-01 +16:00"); //tzh out of range + verifyBadParseString("yyyy-mm-dd tzh:tzm", "2019-01-01 
+14:60"); //tzm out of range + verifyBadParseString("YYYY DDD", "2000 367"); //ddd out of range + } + + private void verifyBadPattern(String string, boolean forParsing) { + try { + formatter = new HiveSqlDateTimeFormatter(string, forParsing); + fail(); + } catch (Exception e) { + assertEquals(e.getClass().getName(), IllegalArgumentException.class.getName()); + } + } + + /** + * Verify pattern is parsed correctly. + * Check: + * -token.temporalField for each token + * -sum of token.lengths + * -concatenation of token.strings + */ + private void verifyPatternParsing(String pattern, ArrayList temporalFields) { + verifyPatternParsing(pattern, pattern.length(), pattern.toLowerCase(), temporalFields); + } + + private void verifyPatternParsing(String pattern, int expectedPatternLength, + String expectedPattern, ArrayList temporalFields) { + formatter = new HiveSqlDateTimeFormatter(pattern, false); + assertEquals(temporalFields.size(), formatter.getTokens().size()); + StringBuilder sb = new StringBuilder(); + int actualPatternLength = 0; + for (int i = 0; i < temporalFields.size(); i++) { + assertEquals("Generated list of tokens not correct", temporalFields.get(i), + formatter.getTokens().get(i).temporalField); + sb.append(formatter.getTokens().get(i).string); + actualPatternLength += formatter.getTokens().get(i).length; + } + assertEquals("Token strings concatenated don't match original pattern string", + expectedPattern, sb.toString()); + assertEquals(expectedPatternLength, actualPatternLength); + } + + private void verifyBadParseString(String pattern, String string) { + try { + formatter = new HiveSqlDateTimeFormatter(pattern, true); + formatter.parseTimestamp(string); + fail(); + } catch (Exception e) { + assertEquals(e.getClass().getName(), IllegalArgumentException.class.getName()); + } + } + + + // Methods that construct datetime objects using java.time.DateTimeFormatter. 
+ + public static Date toDate(String s) { + LocalDate localDate = LocalDate.parse(s, DATE_FORMATTER); + return Date.ofEpochDay((int) localDate.toEpochDay()); + } + + /** + * This is effectively the old Timestamp.valueOf method. + */ + public static Timestamp toTimestamp(String s) { + LocalDateTime localDateTime = LocalDateTime.parse(s.trim(), TIMESTAMP_FORMATTER); + return Timestamp.ofEpochSecond( + localDateTime.toEpochSecond(ZoneOffset.UTC), localDateTime.getNano()); + } + + private static final DateTimeFormatter DATE_FORMATTER = + DateTimeFormatter.ofPattern("yyyy-MM-dd"); + private static final DateTimeFormatter TIMESTAMP_FORMATTER; + static { + DateTimeFormatterBuilder builder = new DateTimeFormatterBuilder(); + builder.appendValue(YEAR, 1, 10, SignStyle.NORMAL).appendLiteral('-') + .appendValue(MONTH_OF_YEAR, 1, 2, SignStyle.NORMAL).appendLiteral('-') + .appendValue(DAY_OF_MONTH, 1, 2, SignStyle.NORMAL) + .optionalStart().appendLiteral(" ") + .appendValue(HOUR_OF_DAY, 1, 2, SignStyle.NORMAL).appendLiteral(':') + .appendValue(MINUTE_OF_HOUR, 1, 2, SignStyle.NORMAL).appendLiteral(':') + .appendValue(SECOND_OF_MINUTE, 1, 2, SignStyle.NORMAL) + .optionalStart().appendFraction(ChronoField.NANO_OF_SECOND, 1, 9, true).optionalEnd() + .optionalEnd(); + TIMESTAMP_FORMATTER = builder.toFormatter().withResolverStyle(ResolverStyle.LENIENT); + } +} diff --git common/src/test/org/apache/hadoop/hive/common/format/datetime/package-info.java common/src/test/org/apache/hadoop/hive/common/format/datetime/package-info.java new file mode 100644 index 0000000000..70ee4266f4 --- /dev/null +++ common/src/test/org/apache/hadoop/hive/common/format/datetime/package-info.java @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Tests formatting and parsing of datetime objects. + */ +package org.apache.hadoop.hive.common.format.datetime; diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java index d08b05fb68..c09db9af65 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java @@ -528,6 +528,7 @@ system.registerGenericUDF("to_epoch_milli", GenericUDFEpochMilli.class); system.registerGenericUDF("bucket_number", GenericUDFBucketNumber.class); system.registerGenericUDF("tumbling_window", GenericUDFTumbledWindow.class); + system.registerGenericUDF("cast_format", GenericUDFCastFormat.class); // Generic UDTF's system.registerGenericUDTF("explode", GenericUDTFExplode.class); diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/IdentifiersParser.g ql/src/java/org/apache/hadoop/hive/ql/parse/IdentifiersParser.g index 58fe0cd32e..013079c3d2 100644 --- ql/src/java/org/apache/hadoop/hive/ql/parse/IdentifiersParser.g +++ ql/src/java/org/apache/hadoop/hive/ql/parse/IdentifiersParser.g @@ -247,8 +247,18 @@ castExpression LPAREN expression KW_AS - primitiveType - RPAREN -> ^(TOK_FUNCTION primitiveType expression) + toType=primitiveType + (fmt=KW_FORMAT StringLiteral)? + RPAREN + // simple cast + -> {$fmt == null}? 
^(TOK_FUNCTION $toType expression) + + // plain cast ... format: toType is int representing a TOK_* in HiveParser_IdentifiersParser, expression, format pattern + -> {((CommonTree)toType.getTree()).getChild(0) == null}? + ^(TOK_FUNCTION {adaptor.create(Identifier, "cast_format")} NumberLiteral[Integer.toString(((CommonTree)toType.getTree()).token.getType())] expression StringLiteral) + + // cast ... format to type with 4th parameter which is length of CHAR or VARCHAR + -> ^(TOK_FUNCTION {adaptor.create(Identifier, "cast_format")} NumberLiteral[Integer.toString(((CommonTree)toType.getTree()).token.getType())] expression StringLiteral NumberLiteral[((CommonTree)toType.getTree()).getChild(0).getText()]) ; caseExpression diff --git ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFCastFormat.java ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFCastFormat.java new file mode 100644 index 0000000000..16742eee9b --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFCastFormat.java @@ -0,0 +1,252 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.ql.udf.generic; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.ImmutableMap; +import org.apache.hadoop.hive.common.format.datetime.HiveSqlDateTimeFormatter; +import org.apache.hadoop.hive.common.type.Date; +import org.apache.hadoop.hive.common.type.HiveChar; +import org.apache.hadoop.hive.common.type.HiveVarchar; +import org.apache.hadoop.hive.common.type.Timestamp; +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.parse.HiveParser_IdentifiersParser; +import org.apache.hadoop.hive.serde.serdeConstants; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.DateObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveCharObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveVarcharObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.SettableDateObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.SettableHiveCharObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.SettableHiveVarcharObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.SettableTimestampObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.TimestampObjectInspector; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import 
org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; +import org.apache.hadoop.io.Text; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.Serializable; +import java.util.Map; + +/** + * CAST( AS FORMAT ). + * + * Vector expressions: CastDateToCharWithFormat, CastDateToStringWithFormat, + * CastDateToVarCharWithFormat, CastTimestampToCharWithFormat, + * CastTimestampToStringWithFormat, CastTimestampToVarCharWithFormat. + * Could not use @VectorizedExpressions annotation because e.g. CastXToCharWithFormat, + * CastXToStringWithFormat, CastXToVarCharWithFormat would have same description. + */ +@Description(name = "cast_format", + value = "CAST( AS FORMAT ) - Converts a datetime value to string or" + + " string-type value to datetime based on the format pattern specified.", + extended = "If format is specified with FORMAT argument then SQL:2016 datetime formats will " + + "be used.\n" + + "Example:\n " + + " > SELECT CAST(\"2018-01-01 4 PM\" AS timestamp FORMAT \"yyyy-mm-dd hh12 AM\");\n" + + " 2018-01-01 16:00:00") +public class GenericUDFCastFormat extends GenericUDF implements Serializable { + + private static final Logger LOG = LoggerFactory.getLogger(GenericUDFCastFormat.class.getName()); + + @VisibleForTesting + static final Map OUTPUT_TYPES = ImmutableMap.builder() + .put(HiveParser_IdentifiersParser.TOK_STRING, serdeConstants.STRING_TYPE_NAME) + .put(HiveParser_IdentifiersParser.TOK_VARCHAR, serdeConstants.VARCHAR_TYPE_NAME) + .put(HiveParser_IdentifiersParser.TOK_CHAR, serdeConstants.CHAR_TYPE_NAME) + .put(HiveParser_IdentifiersParser.TOK_TIMESTAMP, serdeConstants.TIMESTAMP_TYPE_NAME) + .put(HiveParser_IdentifiersParser.TOK_DATE, serdeConstants.DATE_TYPE_NAME).build(); + + private transient HiveSqlDateTimeFormatter formatter; + private transient PrimitiveObjectInspector outputOI; + private transient PrimitiveObjectInspector inputOI; + + public GenericUDFCastFormat() { + } + + /** + * @param arguments + * 0. 
const int, value of a HiveParser_IdentifiersParser constant which represents a TOK_[TYPE] + * 1. expression to convert + * 2. constant string, format pattern + * 3. (optional) constant int, output char/varchar length + */ + @Override public ObjectInspector initialize(ObjectInspector[] arguments) + throws UDFArgumentException { + if (arguments.length != 3 && arguments.length != 4) { + throw new UDFArgumentException( + "Function cast_format requires 3 or 4 arguments (int, expression, StringLiteral" + + "[, var/char length]), got " + arguments.length); + } + + outputOI = getOutputOI(arguments); + try { + inputOI = (PrimitiveObjectInspector) arguments[1]; + } catch (ClassCastException e) { + throw new UDFArgumentException( + "Function CAST...as ... FORMAT ...takes only primitive types"); + } + PrimitiveObjectInspectorUtils.PrimitiveGrouping inputPG = + PrimitiveObjectInspectorUtils.getPrimitiveGrouping(inputOI.getPrimitiveCategory()); + PrimitiveObjectInspectorUtils.PrimitiveGrouping outputPG = + PrimitiveObjectInspectorUtils.getPrimitiveGrouping(outputOI.getPrimitiveCategory()); + + if (inputOI.getPrimitiveCategory() + == PrimitiveObjectInspector.PrimitiveCategory.TIMESTAMPLOCALTZ) { + throw new UDFArgumentException( + "Timestamp with local time zone not yet supported for cast ... format function"); + } + if (!(inputPG == PrimitiveObjectInspectorUtils.PrimitiveGrouping.STRING_GROUP + && outputPG == PrimitiveObjectInspectorUtils.PrimitiveGrouping.DATE_GROUP + || inputPG == PrimitiveObjectInspectorUtils.PrimitiveGrouping.DATE_GROUP + && outputPG == PrimitiveObjectInspectorUtils.PrimitiveGrouping.STRING_GROUP + || inputPG == PrimitiveObjectInspectorUtils.PrimitiveGrouping.VOID_GROUP)) { + throw new UDFArgumentException( + "Function CAST...as ... FORMAT ... only converts datetime objects to string types" + + " and string or void objects to datetime types. 
Type of object provided: " + outputOI.getPrimitiveCategory() + " in primitive grouping " + outputPG + ", type provided: " + inputOI.getPrimitiveCategory() + " in primitive grouping " + inputPG); + } + + boolean forParsing = (outputPG == PrimitiveObjectInspectorUtils.PrimitiveGrouping.DATE_GROUP); + formatter = new HiveSqlDateTimeFormatter(getConstantStringValue(arguments, 2), forParsing); + return outputOI; + } + + private PrimitiveObjectInspector getOutputOI(ObjectInspector[] arguments) + throws UDFArgumentException { + int key = getConstantIntValue(arguments, 0); + if (!OUTPUT_TYPES.keySet().contains(key)) { + throw new UDFArgumentException("Cast...format can only convert to DATE, TIMESTAMP, STRING," + + " VARCHAR, CHAR. Can't convert to HiveParser_IdentifiersParser constant with value " + + key); + } + String typeString = OUTPUT_TYPES.get(key); + if (serdeConstants.VARCHAR_TYPE_NAME.equals(typeString) + || serdeConstants.CHAR_TYPE_NAME.equals(typeString)) { + if (arguments.length < 4 || arguments[3] == null) { + throw new UDFArgumentException(typeString + " missing length argument"); + } + typeString += "(" + getConstantIntValue(arguments, 3) + ")"; + } + PrimitiveTypeInfo typeInfo = TypeInfoFactory.getPrimitiveTypeInfo(typeString); + return PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(typeInfo); + } + + @Override public Object evaluate(DeferredObject[] arguments) throws HiveException { + Object o0 = arguments[1].get(); + if (o0 == null) { + return null; + } + return convert(o0); + } + + private Object convert(Object o) throws HiveException { + Object input; + switch (inputOI.getPrimitiveCategory()) { + case STRING: + input = ((StringObjectInspector) inputOI).getPrimitiveJavaObject(o); + break; + case CHAR: + input = ((HiveCharObjectInspector) inputOI).getPrimitiveJavaObject(o).getStrippedValue(); + break; + case VARCHAR: + input = ((HiveVarcharObjectInspector) inputOI).getPrimitiveJavaObject(o).toString(); + break; + case TIMESTAMP: 
+ input = ((TimestampObjectInspector) inputOI).getPrimitiveWritableObject(o).getTimestamp(); + break; + case DATE: + input = ((DateObjectInspector) inputOI).getPrimitiveWritableObject(o).get(); + break; + default: + throw new HiveException("Input type " + inputOI.getPrimitiveCategory() + " not valid"); + } + + // format here + Object formattedOutput = null; + if (inputOI.getPrimitiveCategory() == PrimitiveObjectInspector.PrimitiveCategory.DATE) { + formattedOutput = formatter.format((Date) input); + if (formattedOutput == null) { + return null; + } + } else if (inputOI.getPrimitiveCategory() == PrimitiveObjectInspector.PrimitiveCategory.TIMESTAMP) { + formattedOutput = formatter.format((Timestamp) input); + if (formattedOutput == null) { + return null; + } + } + + // parse and create Writables + switch (outputOI.getPrimitiveCategory()) { + case STRING: + return new Text((String) formattedOutput); + case CHAR: + return ((SettableHiveCharObjectInspector) outputOI) + .create(new HiveChar((String) formattedOutput, -1)); + case VARCHAR: + return ((SettableHiveVarcharObjectInspector) outputOI) + .create(new HiveVarchar((String) formattedOutput, -1)); + case TIMESTAMP: + Timestamp t = formatter.parseTimestamp((String) input); + if (t == null) { + return null; + } + return ((SettableTimestampObjectInspector) outputOI).create(t); + case DATE: + Date d = formatter.parseDate((String) input); + if (d == null) { + return null; + } + return ((SettableDateObjectInspector) outputOI).create(d); + default: + throw new HiveException("Output type " + outputOI.getPrimitiveCategory() + " not valid"); + } + } + + @Override public String getDisplayString(String[] children) { + assert children.length == 3 || children.length == 4; + StringBuilder sb = new StringBuilder(); + sb.append("CAST( "); + sb.append(children[1]); + sb.append(" AS "); + int typeKey = Integer.parseInt(children[0]); + if (!OUTPUT_TYPES.keySet().contains(typeKey)) { + sb.append("HiveParser_IdentifiersParser index 
").append(typeKey); + } else { + sb.append(OUTPUT_TYPES.get(typeKey)); + if (children.length == 4) { + sb.append("(").append(children[3]).append(")"); + } + } + sb.append(" FORMAT "); + sb.append(children[2]); + sb.append(" )"); + return sb.toString(); + } +} diff --git ql/src/test/org/apache/hadoop/hive/ql/udf/generic/TestGenericUDFCastFormat.java ql/src/test/org/apache/hadoop/hive/ql/udf/generic/TestGenericUDFCastFormat.java new file mode 100644 index 0000000000..9afd5af2be --- /dev/null +++ ql/src/test/org/apache/hadoop/hive/ql/udf/generic/TestGenericUDFCastFormat.java @@ -0,0 +1,188 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hive.ql.udf.generic; + +import org.apache.hadoop.hive.common.type.Date; +import org.apache.hadoop.hive.common.type.HiveChar; +import org.apache.hadoop.hive.common.type.HiveVarchar; +import org.apache.hadoop.hive.common.type.Timestamp; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.parse.HiveParser_IdentifiersParser; +import org.apache.hadoop.hive.serde2.io.DateWritableV2; +import org.apache.hadoop.hive.serde2.io.TimestampWritableV2; +import org.apache.hadoop.hive.serde2.objectinspector.ConstantObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.Text; +import org.junit.Test; + +import static junit.framework.TestCase.assertEquals; +import static junit.framework.TestCase.assertNull; +import static org.junit.Assert.fail; + +/** + * Tests CAST ( AS STRING/CHAR/VARCHAR FORMAT ) and + * CAST ( AS TIMESTAMP/DATE FORMAT ). 
+ */ +public class TestGenericUDFCastFormat { + + //type codes + public static final int CHAR = HiveParser_IdentifiersParser.TOK_CHAR; + public static final int VARCHAR = HiveParser_IdentifiersParser.TOK_VARCHAR; + public static final int STRING = HiveParser_IdentifiersParser.TOK_STRING; + public static final int DATE = HiveParser_IdentifiersParser.TOK_DATE; + public static final int TIMESTAMP = HiveParser_IdentifiersParser.TOK_TIMESTAMP; + + @Test + public void testDateToStringWithFormat() throws HiveException { + ObjectInspector inputOI = PrimitiveObjectInspectorFactory.writableDateObjectInspector; + testCast(STRING, inputOI, date("2009-07-30"), "yyyy-MM-dd", "2009-07-30"); + testCast(STRING, inputOI, date("2009-07-30"), "yyyy", "2009"); + testCast(STRING, inputOI, date("1969-07-30"), "dd", "30"); + + testCast(CHAR, 3, inputOI, date("2009-07-30"), "yyyy-MM-dd", "200"); + testCast(CHAR, 3, inputOI, date("2009-07-30"), "yyyy", "200"); + testCast(CHAR, 3, inputOI, date("1969-07-30"), "dd", "30 "); + + testCast(VARCHAR, 3, inputOI, date("2009-07-30"), "yyyy-MM-dd", "200"); + testCast(VARCHAR, 3, inputOI, date("2009-07-30"), "yyyy", "200"); + testCast(VARCHAR, 3, inputOI, date("1969-07-30"), "dd", "30"); + } + + @Test public void testTimestampToStringTypesWithFormat() throws HiveException { + ObjectInspector inputOI = PrimitiveObjectInspectorFactory.writableTimestampObjectInspector; + testCast(STRING, inputOI, timestamp("2009-07-30 00:00:08"), + "yyyy-MM-dd HH24:mi:ss", "2009-07-30 00:00:08"); + testCast(STRING, inputOI, timestamp("2009-07-30 11:02:00"), + "MM/dd/yyyy hh24miss", "07/30/2009 110200"); + testCast(STRING, inputOI, timestamp("2009-07-30 01:02:03"), "MM", "07"); + testCast(STRING, inputOI, timestamp("1969-07-30 00:00:00"), "yy", "69"); + + testCast(CHAR, 3, inputOI, timestamp("2009-07-30 00:00:08"), + "yyyy-MM-dd HH24:mi:ss", "200"); + testCast(CHAR, 3, inputOI, timestamp("2009-07-30 11:02:00"), + "MM/dd/yyyy hh24miss", "07/"); + testCast(CHAR, 3, inputOI, 
timestamp("2009-07-30 01:02:03"), "MM", "07 "); + testCast(CHAR, 3, inputOI, timestamp("1969-07-30 00:00:00"), "yy", "69 "); + + testCast(VARCHAR, 3, inputOI, timestamp("2009-07-30 00:00:08"), + "yyyy-MM-dd HH24:mi:ss", "200"); + testCast(VARCHAR, 3, inputOI, timestamp("2009-07-30 11:02:00"), + "MM/dd/yyyy hh24miss", "07/"); + testCast(VARCHAR, 3, inputOI, timestamp("2009-07-30 01:02:03"), "MM", "07"); + testCast(VARCHAR, 3, inputOI, timestamp("1969-07-30 00:00:00"), "yy", "69"); + } + + @Test public void testStringTypesToDateWithFormat() throws HiveException { + ObjectInspector inputOI = PrimitiveObjectInspectorFactory.javaStringObjectInspector; + testCast(DATE, inputOI, "1969-07-30 13:00", "yyyy-MM-dd hh24:mi", "1969-07-30"); + testCast(DATE, inputOI, "307-2009", "ddmm-yyyy", "2009-07-30"); + testCast(DATE, inputOI, "307-2009", "ddd-yyyy", "2009-11-03"); + + inputOI = PrimitiveObjectInspectorFactory.javaHiveCharObjectInspector; + testCast(DATE, inputOI, new HiveChar("1969-07-30 13:00", 15), "yyyy-MM-dd hh24:mi", + "1969-07-30"); + testCast(DATE, inputOI, new HiveChar("307-2009", 7), "ddmm-yyyy", "2200-07-30"); + testCast(DATE, inputOI, new HiveChar("307-2009", 7), "ddd-yyyy", "2200-11-03"); + + inputOI = PrimitiveObjectInspectorFactory.javaHiveVarcharObjectInspector; + testCast(DATE, inputOI, new HiveVarchar("1969-07-30 13:00", 15), "yyyy-MM-dd hh24:mi", + "1969-07-30"); + testCast(DATE, inputOI, new HiveVarchar("307-2009", 7), "ddmm-yyyy", "2200-07-30"); + testCast(DATE, inputOI, new HiveVarchar("307-2009", 7), "ddd-yyyy", "2200-11-03"); + } + + @Test public void testStringTypesToTimestampWithFormat() throws HiveException { + ObjectInspector inputOI = PrimitiveObjectInspectorFactory.javaStringObjectInspector; + testCast(TIMESTAMP, inputOI, "2009-07-30 01:02:03", "yyyy-MM-dd HH24:mi:ss", + "2009-07-30 01:02:03"); + testCast(TIMESTAMP, inputOI, "07/30/2009 11:0200", "MM/dd/yyyy hh24:miss", + "2009-07-30 11:02:00"); + testCast(TIMESTAMP, inputOI, "969.07.30.", 
"yyy.MM.dd.", "2969-07-30 00:00:00"); + + inputOI = PrimitiveObjectInspectorFactory.javaHiveCharObjectInspector; + testCast(TIMESTAMP, 13, inputOI, new HiveChar("2009-07-30 01:02:03", 13), "yyyy-MM-dd HH24", + "2009-07-30 01:00:00"); + testCast(TIMESTAMP, 18, inputOI, new HiveChar("07/30/2009 11:0200", 18), "MM/dd/yyyy hh24:miss", + "2009-07-30 11:02:00"); + testCast(TIMESTAMP, 10, inputOI, new HiveChar("969.07.30.12:00", 10), "yyy.MM.dd.", + "2969-07-30 00:00:00"); + + inputOI = PrimitiveObjectInspectorFactory.javaHiveVarcharObjectInspector; + testCast(TIMESTAMP, 13, inputOI, new HiveVarchar("2009-07-30 01:02:03", 13), "yyyy-MM-dd HH24", + "2009-07-30 01:00:00"); + testCast(TIMESTAMP, 18, inputOI, new HiveVarchar("07/30/2009 11:0200", 18), + "MM/dd/yyyy hh24:miss", "2009-07-30 11:02:00"); + testCast(TIMESTAMP, 10, inputOI, new HiveVarchar("969.07.30.12:00", 10), "yyy.MM.dd.", + "2969-07-30 00:00:00"); + } + + private TimestampWritableV2 timestamp(String s) { + return new TimestampWritableV2(Timestamp.valueOf(s)); + } + + private DateWritableV2 date(String s) { + return new DateWritableV2(Date.valueOf(s)); + } + + private void testCast(int typeCode, ObjectInspector inputOI, Object input, String format, + String expOutput) throws HiveException { + testCast(typeCode, 0, inputOI, input, format, expOutput); + } + + private void testCast(int typeCode, int length, ObjectInspector inputOI, Object input, String format, + String expOutput) + throws HiveException { + // initialize + GenericUDFCastFormat udf = new GenericUDFCastFormat(); + ConstantObjectInspector typeCodeOI = + PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector( + TypeInfoFactory.getPrimitiveTypeInfo("int"), new IntWritable(typeCode)); + ConstantObjectInspector formatOI = + PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector( + TypeInfoFactory.getPrimitiveTypeInfo("string"), new Text(format)); + ConstantObjectInspector lengthOI = + 
PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector( + TypeInfoFactory.getPrimitiveTypeInfo("int"), new IntWritable(length)); + ObjectInspector[] initArgs = {typeCodeOI, inputOI, formatOI, lengthOI}; + udf.initialize(initArgs); + + // evaluate + GenericUDF.DeferredObject typeCodeObj = new GenericUDF.DeferredJavaObject(typeCode); + GenericUDF.DeferredObject inputObj = new GenericUDF.DeferredJavaObject(input); + GenericUDF.DeferredObject formatObj = new GenericUDF.DeferredJavaObject(new Text(format)); + GenericUDF.DeferredObject lengthObj = new GenericUDF.DeferredJavaObject(length); + GenericUDF.DeferredObject[] evalArgs = {typeCodeObj, inputObj, formatObj, lengthObj}; + Object output = udf.evaluate(evalArgs); + if (output == null) { + fail( + "Cast " + inputOI.getTypeName() + " \"" + input + "\" to " + GenericUDFCastFormat.OUTPUT_TYPES + .get(typeCode) + " failed, output null"); + } + assertEquals( + "Cast " + inputOI.getTypeName() + " \"" + input + "\" to " + GenericUDFCastFormat.OUTPUT_TYPES.get(typeCode) + + " failed ", expOutput, output.toString()); + + // Try with null input + GenericUDF.DeferredObject[] nullArgs = + {typeCodeObj, new GenericUDF.DeferredJavaObject(null), formatObj, lengthObj}; + assertNull(udf.getFuncName() + " with NULL arguments failed", udf.evaluate(nullArgs)); + } +} diff --git ql/src/test/queries/clientpositive/cast_datetime_with_sql_2016_format.q ql/src/test/queries/clientpositive/cast_datetime_with_sql_2016_format.q new file mode 100644 index 0000000000..269edf6da6 --- /dev/null +++ ql/src/test/queries/clientpositive/cast_datetime_with_sql_2016_format.q @@ -0,0 +1,45 @@ +--non-vectorized +set hive.vectorized.execution.enabled=false; +set hive.fetch.task.conversion=more; + +create table timestamp1 (t timestamp) stored as parquet; +insert into timestamp1 values +("2020-02-03"), +("1969-12-31 23:59:59.999999999") +; +from timestamp1 select cast (t as string format "yyyy hh24...PM ff"); +from timestamp1 select cast (t 
as char(11) format "yyyy hh24...PM ff"); -- will be truncated +from timestamp1 select cast (t as varchar(11) format "yyyy hh24...PM ff"); -- will be truncated + +create table dates (d date) stored as parquet; +insert into dates values +("2020-02-03"), +("1969-12-31") +; +from dates select cast (d as string format "yyyy mm dd , hh24 mi ss ff9"); +from dates select cast (d as char(10) format "yyyy mm dd , hh24 mi ss ff9"); -- will be truncated +from dates select cast (d as varchar(10) format "yyyy mm dd , hh24 mi ss ff9"); -- will be truncated + +create table strings (s string) stored as parquet; +create table varchars (s varchar(11)) stored as parquet; +create table chars (s char(11)) stored as parquet; +insert into strings values +("20 / 2 / 3"), +("1969 12 31") +; +insert into varchars select * from strings; +insert into chars select * from strings; + +from strings select cast (s as timestamp format "yyyy.mm.dd"); +from strings select cast (s as date format "yyyy.mm.dd"); +from varchars select cast (s as timestamp format "yyyy.mm.dd"); +from varchars select cast (s as date format "yyyy.mm.dd"); +from chars select cast (s as timestamp format "yyyy.mm.dd"); +from chars select cast (s as date format "yyyy.mm.dd"); + + +--correct descriptions +explain from strings select cast (s as timestamp format "yyy.mm.dd"); +explain from strings select cast (s as date format "yyy.mm.dd"); +explain from timestamp1 select cast (t as string format "yyyy"); +explain from timestamp1 select cast (t as varchar(12) format "yyyy"); diff --git ql/src/test/results/clientpositive/cast_datetime_with_sql_2016_format.q.out ql/src/test/results/clientpositive/cast_datetime_with_sql_2016_format.q.out new file mode 100644 index 0000000000..4a502b9700 --- /dev/null +++ ql/src/test/results/clientpositive/cast_datetime_with_sql_2016_format.q.out @@ -0,0 +1,329 @@ +PREHOOK: query: create table timestamp1 (t timestamp) stored as parquet +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default 
+PREHOOK: Output: default@timestamp1 +POSTHOOK: query: create table timestamp1 (t timestamp) stored as parquet +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@timestamp1 +PREHOOK: query: insert into timestamp1 values +("2020-02-03"), +("1969-12-31 23:59:59.999999999") +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@timestamp1 +POSTHOOK: query: insert into timestamp1 values +("2020-02-03"), +("1969-12-31 23:59:59.999999999") +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@timestamp1 +POSTHOOK: Lineage: timestamp1.t SCRIPT [] +PREHOOK: query: from timestamp1 select cast (t as string format "yyyy hh24...PM ff") +PREHOOK: type: QUERY +PREHOOK: Input: default@timestamp1 +#### A masked pattern was here #### +POSTHOOK: query: from timestamp1 select cast (t as string format "yyyy hh24...PM ff") +POSTHOOK: type: QUERY +POSTHOOK: Input: default@timestamp1 +#### A masked pattern was here #### +2020 00...AM 0 +1969 23...PM 999999999 +PREHOOK: query: from timestamp1 select cast (t as char(11) format "yyyy hh24...PM ff") +PREHOOK: type: QUERY +PREHOOK: Input: default@timestamp1 +#### A masked pattern was here #### +POSTHOOK: query: from timestamp1 select cast (t as char(11) format "yyyy hh24...PM ff") +POSTHOOK: type: QUERY +POSTHOOK: Input: default@timestamp1 +#### A masked pattern was here #### +2020 00...A +1969 23...P +PREHOOK: query: -- will be truncated +from timestamp1 select cast (t as varchar(11) format "yyyy hh24...PM ff") +PREHOOK: type: QUERY +PREHOOK: Input: default@timestamp1 +#### A masked pattern was here #### +POSTHOOK: query: -- will be truncated +from timestamp1 select cast (t as varchar(11) format "yyyy hh24...PM ff") +POSTHOOK: type: QUERY +POSTHOOK: Input: default@timestamp1 +#### A masked pattern was here #### +2020 00...A +1969 23...P +PREHOOK: query: -- will be truncated + +create table dates (d date) stored as 
parquet +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@dates +POSTHOOK: query: -- will be truncated + +create table dates (d date) stored as parquet +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@dates +PREHOOK: query: insert into dates values +("2020-02-03"), +("1969-12-31") +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@dates +POSTHOOK: query: insert into dates values +("2020-02-03"), +("1969-12-31") +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@dates +POSTHOOK: Lineage: dates.d SCRIPT [] +PREHOOK: query: from dates select cast (d as string format "yyyy mm dd , hh24 mi ss ff9") +PREHOOK: type: QUERY +PREHOOK: Input: default@dates +#### A masked pattern was here #### +POSTHOOK: query: from dates select cast (d as string format "yyyy mm dd , hh24 mi ss ff9") +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dates +#### A masked pattern was here #### +2020 02 03 , 00 00 00 000000000 +1969 12 31 , 00 00 00 000000000 +PREHOOK: query: from dates select cast (d as char(10) format "yyyy mm dd , hh24 mi ss ff9") +PREHOOK: type: QUERY +PREHOOK: Input: default@dates +#### A masked pattern was here #### +POSTHOOK: query: from dates select cast (d as char(10) format "yyyy mm dd , hh24 mi ss ff9") +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dates +#### A masked pattern was here #### +2020 02 03 +1969 12 31 +PREHOOK: query: -- will be truncated +from dates select cast (d as varchar(10) format "yyyy mm dd , hh24 mi ss ff9") +PREHOOK: type: QUERY +PREHOOK: Input: default@dates +#### A masked pattern was here #### +POSTHOOK: query: -- will be truncated +from dates select cast (d as varchar(10) format "yyyy mm dd , hh24 mi ss ff9") +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dates +#### A masked pattern was here #### +2020 02 03 +1969 12 31 +PREHOOK: query: -- will be truncated + +create 
table strings (s string) stored as parquet +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@strings +POSTHOOK: query: -- will be truncated + +create table strings (s string) stored as parquet +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@strings +PREHOOK: query: create table varchars (s varchar(11)) stored as parquet +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@varchars +POSTHOOK: query: create table varchars (s varchar(11)) stored as parquet +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@varchars +PREHOOK: query: create table chars (s char(11)) stored as parquet +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@chars +POSTHOOK: query: create table chars (s char(11)) stored as parquet +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@chars +PREHOOK: query: insert into strings values +("20 / 2 / 3"), +("1969 12 31") +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@strings +POSTHOOK: query: insert into strings values +("20 / 2 / 3"), +("1969 12 31") +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@strings +POSTHOOK: Lineage: strings.s SCRIPT [] +PREHOOK: query: insert into varchars select * from strings +PREHOOK: type: QUERY +PREHOOK: Input: default@strings +PREHOOK: Output: default@varchars +POSTHOOK: query: insert into varchars select * from strings +POSTHOOK: type: QUERY +POSTHOOK: Input: default@strings +POSTHOOK: Output: default@varchars +POSTHOOK: Lineage: varchars.s EXPRESSION [(strings)strings.FieldSchema(name:s, type:string, comment:null), ] +PREHOOK: query: insert into chars select * from strings +PREHOOK: type: QUERY +PREHOOK: Input: default@strings +PREHOOK: Output: default@chars +POSTHOOK: query: insert into 
chars select * from strings +POSTHOOK: type: QUERY +POSTHOOK: Input: default@strings +POSTHOOK: Output: default@chars +POSTHOOK: Lineage: chars.s EXPRESSION [(strings)strings.FieldSchema(name:s, type:string, comment:null), ] +PREHOOK: query: from strings select cast (s as timestamp format "yyyy.mm.dd") +PREHOOK: type: QUERY +PREHOOK: Input: default@strings +#### A masked pattern was here #### +POSTHOOK: query: from strings select cast (s as timestamp format "yyyy.mm.dd") +POSTHOOK: type: QUERY +POSTHOOK: Input: default@strings +#### A masked pattern was here #### +2020-02-03 00:00:00 +1969-12-31 00:00:00 +PREHOOK: query: from strings select cast (s as date format "yyyy.mm.dd") +PREHOOK: type: QUERY +PREHOOK: Input: default@strings +#### A masked pattern was here #### +POSTHOOK: query: from strings select cast (s as date format "yyyy.mm.dd") +POSTHOOK: type: QUERY +POSTHOOK: Input: default@strings +#### A masked pattern was here #### +2020-02-03 +1969-12-31 +PREHOOK: query: from varchars select cast (s as timestamp format "yyyy.mm.dd") +PREHOOK: type: QUERY +PREHOOK: Input: default@varchars +#### A masked pattern was here #### +POSTHOOK: query: from varchars select cast (s as timestamp format "yyyy.mm.dd") +POSTHOOK: type: QUERY +POSTHOOK: Input: default@varchars +#### A masked pattern was here #### +2020-02-03 00:00:00 +1969-12-31 00:00:00 +PREHOOK: query: from varchars select cast (s as date format "yyyy.mm.dd") +PREHOOK: type: QUERY +PREHOOK: Input: default@varchars +#### A masked pattern was here #### +POSTHOOK: query: from varchars select cast (s as date format "yyyy.mm.dd") +POSTHOOK: type: QUERY +POSTHOOK: Input: default@varchars +#### A masked pattern was here #### +2020-02-03 +1969-12-31 +PREHOOK: query: from chars select cast (s as timestamp format "yyyy.mm.dd") +PREHOOK: type: QUERY +PREHOOK: Input: default@chars +#### A masked pattern was here #### +POSTHOOK: query: from chars select cast (s as timestamp format "yyyy.mm.dd") +POSTHOOK: type: QUERY 
+POSTHOOK: Input: default@chars +#### A masked pattern was here #### +2020-02-03 00:00:00 +1969-12-31 00:00:00 +PREHOOK: query: from chars select cast (s as date format "yyyy.mm.dd") +PREHOOK: type: QUERY +PREHOOK: Input: default@chars +#### A masked pattern was here #### +POSTHOOK: query: from chars select cast (s as date format "yyyy.mm.dd") +POSTHOOK: type: QUERY +POSTHOOK: Input: default@chars +#### A masked pattern was here #### +2020-02-03 +1969-12-31 +PREHOOK: query: explain from strings select cast (s as timestamp format "yyy.mm.dd") +PREHOOK: type: QUERY +PREHOOK: Input: default@strings +#### A masked pattern was here #### +POSTHOOK: query: explain from strings select cast (s as timestamp format "yyy.mm.dd") +POSTHOOK: type: QUERY +POSTHOOK: Input: default@strings +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + TableScan + alias: strings + Statistics: Num rows: 2 Data size: 188 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: CAST( s AS timestamp FORMAT 'yyy.mm.dd' ) (type: timestamp) + outputColumnNames: _col0 + Statistics: Num rows: 2 Data size: 80 Basic stats: COMPLETE Column stats: COMPLETE + ListSink + +PREHOOK: query: explain from strings select cast (s as date format "yyy.mm.dd") +PREHOOK: type: QUERY +PREHOOK: Input: default@strings +#### A masked pattern was here #### +POSTHOOK: query: explain from strings select cast (s as date format "yyy.mm.dd") +POSTHOOK: type: QUERY +POSTHOOK: Input: default@strings +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + TableScan + alias: strings + Statistics: Num rows: 2 Data size: 188 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: CAST( s AS date FORMAT 'yyy.mm.dd' ) (type: date) + outputColumnNames: _col0 + Statistics: Num 
rows: 2 Data size: 112 Basic stats: COMPLETE Column stats: COMPLETE + ListSink + +PREHOOK: query: explain from timestamp1 select cast (t as string format "yyyy") +PREHOOK: type: QUERY +PREHOOK: Input: default@timestamp1 +#### A masked pattern was here #### +POSTHOOK: query: explain from timestamp1 select cast (t as string format "yyyy") +POSTHOOK: type: QUERY +POSTHOOK: Input: default@timestamp1 +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + TableScan + alias: timestamp1 + Statistics: Num rows: 2 Data size: 80 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: CAST( t AS string FORMAT 'yyyy' ) (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 2 Data size: 368 Basic stats: COMPLETE Column stats: COMPLETE + ListSink + +PREHOOK: query: explain from timestamp1 select cast (t as varchar(12) format "yyyy") +PREHOOK: type: QUERY +PREHOOK: Input: default@timestamp1 +#### A masked pattern was here #### +POSTHOOK: query: explain from timestamp1 select cast (t as varchar(12) format "yyyy") +POSTHOOK: type: QUERY +POSTHOOK: Input: default@timestamp1 +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + TableScan + alias: timestamp1 + Statistics: Num rows: 2 Data size: 80 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: CAST( t AS varchar(12) FORMAT 'yyyy' ) (type: varchar(12)) + outputColumnNames: _col0 + Statistics: Num rows: 2 Data size: 192 Basic stats: COMPLETE Column stats: COMPLETE + ListSink + diff --git ql/src/test/results/clientpositive/show_functions.q.out ql/src/test/results/clientpositive/show_functions.q.out index 374e9c4fce..84a9243da9 100644 --- ql/src/test/results/clientpositive/show_functions.q.out +++ ql/src/test/results/clientpositive/show_functions.q.out @@ 
-62,6 +62,7 @@ bucket_number buildversion cardinality_violation case +cast_format cbrt ceil ceiling @@ -349,6 +350,7 @@ POSTHOOK: query: SHOW FUNCTIONS '^c.*' POSTHOOK: type: SHOWFUNCTIONS cardinality_violation case +cast_format cbrt ceil ceiling