diff --git a/common/src/java/org/apache/hadoop/hive/common/format/datetime/HiveSqlDateTimeFormatter.java b/common/src/java/org/apache/hadoop/hive/common/format/datetime/HiveSqlDateTimeFormatter.java index 4e024a357b..f5a648b238 100644 --- a/common/src/java/org/apache/hadoop/hive/common/format/datetime/HiveSqlDateTimeFormatter.java +++ b/common/src/java/org/apache/hadoop/hive/common/format/datetime/HiveSqlDateTimeFormatter.java @@ -21,29 +21,35 @@ import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang.WordUtils; import org.apache.hadoop.hive.common.type.Date; import org.apache.hadoop.hive.common.type.Timestamp; import java.time.DateTimeException; -import java.time.Duration; import java.time.Instant; import java.time.LocalDateTime; +import java.time.Month; import java.time.ZoneOffset; +import java.time.format.DateTimeFormatter; +import java.time.format.TextStyle; import java.time.temporal.ChronoField; import java.time.temporal.ChronoUnit; +import java.time.temporal.IsoFields; import java.time.temporal.TemporalField; import java.time.temporal.TemporalUnit; +import java.time.temporal.WeekFields; import java.util.ArrayList; import java.util.Collections; import java.util.List; +import java.util.Locale; import java.util.Map; -import java.util.TimeZone; /** * Formatter using SQL:2016 datetime patterns. * * For all tokens: - * - Patterns are case-insensitive, except AM/PM and T/Z. See these sections for more details. + * - Patterns are case-insensitive, except AM/PM and nested strings. See these sections for more + * details. * - For string to datetime conversion, no duplicate format tokens are allowed, including tokens * that have the same meaning but different lengths ("Y" and "YY" conflict) or different * behaviors ("RR" and "YY" conflict). @@ -53,21 +59,25 @@ * "Y": 1, and so on), with some exceptions (see map SPECIAL_LENGTHS). 
* - For string to datetime conversion, inputs of fewer digits than expected are accepted if * followed by a delimiter, e.g. format="YYYY-MM-DD", input="19-1-1", output=2019-01-01 00:00:00. + * This is modified by format modifier FX (format exact). See FX for details. * - For datetime to string conversion, output is left padded with zeros, e.g. format="DD SSSSS", * input=2019-01-01 00:00:03, output="01 00003". - * + * This is modified by format modifier FM (fill mode). See FM for details. * * Accepted format tokens: - * Note: "|" means "or". "Delimiter" means a separator, tokens T or Z, or end of input. + * Note: - "|" means "or". + * - "Delimiter" means a separator, the beginning of a nested string (must be obvious – e.g. + * if the token in question is numeric, any non-numeric character is a delimiter), or end + * of input. + * - The words token and pattern are used interchangeably. * - * A. Temporal tokens + * A.1. Numeric temporal tokens * YYYY * 4-digit year * - For string to datetime conversion, prefix digits for 1, 2, and 3-digit inputs are obtained * from current date * E.g. input=‘9-01-01’, pattern =‘YYYY-MM-DD’, current year=2020, output=2029-01-01 00:00:00 * - * * YYY * Last 3 digits of a year * - Gets the prefix digit from current date. @@ -104,7 +114,7 @@ * * MM * Month (1-12) - * - For string to datetime conversion, conflicts with DDD. + * - For string to datetime conversion, conflicts with DDD, MONTH, MON. * * DD * Day of month (1-31) @@ -148,23 +158,96 @@ * - In string to datetime conversion, fewer digits than expected are accepted if followed by a * delimiter. "FF" acts like "FF9". * - * AM|A.M. - * Meridiem indicator or AM/PM + * AM|A.M.|PM|P.M. + * Meridiem indicator (or AM/PM) * - Datetime to string conversion: * - AM and PM mean the exact same thing in the pattern. * e.g. input=2019-01-01 20:00, format=“AM”, output=“PM”. * - Retains the exact format (capitalization and length) provided in the pattern string. If p.m. 
* is in the pattern, we expect a.m. or p.m. in the output; if AM is in the pattern, we expect - * AM or PM in the output. + * AM or PM in the output. If the case is mixed (Am or aM) then the output case will match the + * case of the pattern's first character (Am => AM, aM => am). * - String to datetime conversion: * - Conflicts with HH24 and SSSSS. * - It doesn’t matter which meridian indicator is in the pattern. * E.g. input="2019-01-01 11:00 p.m.", pattern="YYYY-MM-DD HH12:MI AM", * output=2019-01-01 23:00:00 + * - Not listed as a character temporal because of special status: does not get padded with spaces + * upon formatting, and case is handled differently at datetime to string conversion. + * + * D + * Day of week (1-7) + * - 1 means Sunday, 2 means Monday, and so on. + * - Not allowed in string to datetime conversion. + * + * Q + * Quarter of year (1-4) + * - Not allowed in string to datetime conversion. + * + * WW + * Aligned week of year (1-53) + * - 1st week begins on January 1st and ends on January 7th, and so on. + * - Not allowed in string to datetime conversion. + * + * W + * Week of month (1-5) + * - 1st week starts on the 1st of the month and ends on the 7th, and so on. + * - Not allowed in string to datetime conversion. + * + * A.2. Character temporals + * Temporal elements, but spelled out. + * - For datetime to string conversion, the pattern's case must match one of the listed formats + * (e.g. mOnTh is not accepted) to avoid ambiguity. Output is right padded with trailing spaces + * unless the pattern is marked with the fill mode modifier (FM). + * - For string to datetime conversion, the case of the pattern does not matter. * - * PM|P.M. - * Meridiem indicator - * See AM|A.M. + * MONTH|Month|month + * Name of month of year + * - For datetime to string conversion, will include trailing spaces up to length 9 (length of + * longest month of year name: "September"). 
Case is taken into account according to the + * following example (pattern => output): + * - MONTH => JANUARY + * - Month => January + * - month => january + * - For string to datetime conversion, neither the case of the pattern nor the case of the input + * are taken into account. + * - For string to datetime conversion, conflicts with MM and MON. + * + * + * MON|Mon|mon + * Abbreviated name of month of year + * - For datetime to string conversion, case is taken into account according to the following + * example (pattern => output): + * - MON => JAN + * - Mon => Jan + * - mon => jan + * - For string to datetime conversion, neither the case of the pattern nor the case of the input + * are taken into account. + * - For string to datetime conversion, conflicts with MM and MONTH. + * + * + * DAY|Day|day + * Name of day of week + * - For datetime to string conversion, will include trailing spaces until length is 9 (length of + * longest day of week name: "Wednesday"). Case is taken into account according to the following + * example (pattern => output): + * - DAY => SUNDAY + * - Day => Sunday + * - day => sunday + * - For string to datetime conversion, neither the case of the pattern nor the case of the input + * are taken into account. + * - Not allowed in string to datetime conversion. + * + * DY|Dy|dy + * Abbreviated name of day of week + * - For datetime to string conversion, case is taken into account according to the following + * example (pattern => output): + * - DY => SUN + * - Dy => Sun + * - dy => sun + * - For string to datetime conversion, neither the case of the pattern nor the case of the input + * are taken into account. + * - Not allowed in string to datetime conversion. * * B. Time zone tokens * TZH @@ -197,37 +280,68 @@ * by a time zone hour (tzh) token, it's a negative sign and not counted as a separator, UNLESS * this is the only possible separator character in the separator substring (in which case it is * not counted as the tzh's negative sign). 
+ * - If the whole pattern string is delimited by single quotes (''), then the apostrophe separator + * (') must be escaped with a single backslash: (\'). + * + * D. Nested strings (Text) + * - Surround with double quotes (") in the pattern. Note, if the whole pattern string is delimited + * by double quotes, then the double quotes must be escaped with a single backslash: (\"). + * - In order to include a literal double quote character within the nested string, the double + * quote character must be escaped with a double backslash: (\\"). If the whole pattern string is + * delimited by double quotes, then escape with a triple backslash: (\\\") + * - If the whole pattern string is delimited by single quotes, literal single + * quotes/apostrophes (') in the nested string must be escaped with a single backslash: (\') + * - For datetime to string conversion, we simply include the string in the output, preserving the + * characters' case. + * - For string to datetime conversion, the information is lost as the nested string won't be part + * of the resulting datetime object. However, the nested string has to match the related part of + * the input string, except case may differ. + * + * E. Format modifier tokens + * FM + * Fill mode modifier + * - Default for string to datetime conversion. Inputs of fewer digits than expected are accepted + * if followed by a delimiter: + * e.g. format="YYYY-MM-DD", input="19-1-1", output=2019-01-01 00:00:00 + * - For datetime to string conversion, padding (trailing spaces for text data and leading zeroes + * for numeric data) is omitted for the temporal element immediately following an "FM" in the + * pattern string. If the element following is not a temporal element (for example, if "FM" + * precedes a separator), an error will be thrown. + * e.g. 
pattern=FMHH12:MI:FMSS, input=2019-01-01 01:01:01, output=1:01:1 + * - Modifies FX so that a lack of leading zeroes is accepted for the element immediately following + * an "FM" in the pattern string. * - * D. ISO 8601 delimiters - * T - * ISO 8601 delimiter - * - Serves as a delimiter. - * - Function is to support formats like “YYYY-MM-DDTHH24:MI:SS.FF9Z”, “YYYY-MM-DD-HH24:MI:SSZ” - * - For datetime to string conversion, output is always capitalized ("T"), even if lowercase ("t") - * is provided in the pattern. - * - * Z - * ISO 8601 delimiter - * See T. + * FX + * Format exact modifier + * - Default for datetime to string conversion. Numeric output is left padded with zeros, and + * non-numeric output except for AM/PM is right padded with spaces up to expected length. + * - Applies to the whole pattern. + * - Rules applied at string to datetime conversion: + * - Separators must match exactly, down to the character. + * - Numeric input can't omit leading zeroes. This rule does not apply to elements (tokens) + * immediately preceded by an "FM". 
*/ public class HiveSqlDateTimeFormatter { private static final int LONGEST_TOKEN_LENGTH = 5; private static final int LONGEST_ACCEPTED_PATTERN = 100; // for sanity's sake - private static final long MINUTES_PER_HOUR = 60; private static final int NANOS_MAX_LENGTH = 9; public static final int AM = 0; public static final int PM = 1; + private static final DateTimeFormatter MONTH_FORMATTER = DateTimeFormatter.ofPattern("MMM"); + public static final DateTimeFormatter DAY_OF_WEEK_FORMATTER = DateTimeFormatter.ofPattern("EEE"); private String pattern; private List tokens = new ArrayList<>(); + private boolean formatExact = false; - private static final Map TEMPORAL_TOKENS = + private static final Map NUMERIC_TEMPORAL_TOKENS = ImmutableMap.builder() .put("yyyy", ChronoField.YEAR).put("yyy", ChronoField.YEAR) .put("yy", ChronoField.YEAR).put("y", ChronoField.YEAR) .put("rrrr", ChronoField.YEAR).put("rr", ChronoField.YEAR) .put("mm", ChronoField.MONTH_OF_YEAR) + .put("d", WeekFields.SUNDAY_START.dayOfWeek()) .put("dd", ChronoField.DAY_OF_MONTH) .put("ddd", ChronoField.DAY_OF_YEAR) .put("hh", ChronoField.HOUR_OF_AMPM) @@ -243,32 +357,44 @@ .put("ff9", ChronoField.NANO_OF_SECOND).put("ff", ChronoField.NANO_OF_SECOND) .put("a.m.", ChronoField.AMPM_OF_DAY).put("am", ChronoField.AMPM_OF_DAY) .put("p.m.", ChronoField.AMPM_OF_DAY).put("pm", ChronoField.AMPM_OF_DAY) + .put("ww", ChronoField.ALIGNED_WEEK_OF_YEAR).put("w", ChronoField.ALIGNED_WEEK_OF_MONTH) + .put("q", IsoFields.QUARTER_OF_YEAR) .build(); + private static final Map CHARACTER_TEMPORAL_TOKENS = + ImmutableMap.builder() + .put("mon", ChronoField.MONTH_OF_YEAR) + .put("month", ChronoField.MONTH_OF_YEAR) + .put("day", WeekFields.SUNDAY_START.dayOfWeek()) + .put("dy", WeekFields.SUNDAY_START.dayOfWeek()) + .build(); + private static final Map TIME_ZONE_TOKENS = ImmutableMap.builder() .put("tzh", ChronoUnit.HOURS).put("tzm", ChronoUnit.MINUTES).build(); - private static final List VALID_ISO_8601_DELIMITERS = - 
ImmutableList.of("t", "z"); - private static final List VALID_SEPARATORS = ImmutableList.of("-", ":", " ", ".", "/", ";", "\'", ","); + private static final List VALID_FORMAT_MODIFIERS = + ImmutableList.of("fm", "fx"); + private static final Map SPECIAL_LENGTHS = ImmutableMap.builder() .put("hh12", 2).put("hh24", 2).put("tzm", 2).put("am", 4).put("pm", 4) .put("ff1", 1).put("ff2", 2).put("ff3", 3).put("ff4", 4).put("ff5", 5) .put("ff6", 6).put("ff7", 7).put("ff8", 8).put("ff9", 9).put("ff", 9) + .put("month", 9).put("day", 9).put("dy", 3) .build(); /** * Represents broad categories of tokens. */ public enum TokenType { - TEMPORAL, + NUMERIC_TEMPORAL, + CHARACTER_TEMPORAL, SEPARATOR, TIMEZONE, - ISO_8601_DELIMITER + TEXT } /** @@ -276,33 +402,37 @@ */ public static class Token { TokenType type; - TemporalField temporalField; // for type TEMPORAL e.g. ChronoField.YEAR + TemporalField temporalField; // for TEMPORAL types e.g. ChronoField.YEAR TemporalUnit temporalUnit; // for type TIMEZONE e.g. ChronoUnit.HOURS String string; // pattern string, e.g. "yyy" int length; // length (e.g. 
YYY: 3, FF8: 8) + boolean fillMode; //FM, applies to type TEMPORAL only (later should apply to TIMEZONE as well) + - public Token(TemporalField temporalField, String string, int length) { - this(TokenType.TEMPORAL, temporalField, null, string, length); + public Token(TokenType tokenType, TemporalField temporalField, String string, int length, + boolean fillMode) { + this(tokenType, temporalField, null, string, length, fillMode); } public Token(TemporalUnit temporalUnit, String string, int length) { - this(TokenType.TIMEZONE, null, temporalUnit, string, length); + this(TokenType.TIMEZONE, null, temporalUnit, string, length, false); } public Token(TokenType tokenType, String string) { - this(tokenType, null, null, string, string.length()); + this(tokenType, null, null, string, string.length(), false); } public Token(TokenType tokenType, TemporalField temporalField, TemporalUnit temporalUnit, - String string, int length) { + String string, int length, boolean fillMode) { this.type = tokenType; this.temporalField = temporalField; this.temporalUnit = temporalUnit; this.string = string; this.length = length; + this.fillMode = fillMode; } - public String toString() { + @Override public String toString() { StringBuilder sb = new StringBuilder(); sb.append(string); sb.append(" type: "); @@ -316,6 +446,11 @@ public String toString() { } return sb.toString(); } + + public void removeBackslashes() { + string = string.replaceAll("\\\\", ""); + length = string.length(); + } } public HiveSqlDateTimeFormatter(String pattern, boolean forParsing) { @@ -342,7 +477,7 @@ private void setPattern(String pattern, boolean forParsing) { /** * Parse pattern to list of tokens. 
*/ - private String parsePatternToTokens(String pattern) { + private void parsePatternToTokens(String pattern) { tokens.clear(); String originalPattern = pattern; pattern = pattern.toLowerCase(); @@ -351,6 +486,7 @@ private String parsePatternToTokens(String pattern) { int begin=0, end=0; String candidate; Token lastAddedToken = null; + boolean fillMode = false; while (begin < pattern.length()) { // if begin hasn't progressed, then pattern is not parsable @@ -367,47 +503,70 @@ private String parsePatternToTokens(String pattern) { } candidate = pattern.substring(begin, end); if (isSeparator(candidate)) { - lastAddedToken = parseSeparatorToken(candidate, lastAddedToken); + lastAddedToken = parseSeparatorToken(candidate, lastAddedToken, fillMode, begin); begin = end; break; } - if (isIso8601Delimiter(candidate)) { - lastAddedToken = parseIso8601DelimiterToken(candidate); + if (isNumericTemporalToken(candidate)) { + lastAddedToken = parseTemporalToken(originalPattern, candidate, fillMode, begin); + fillMode = false; begin = end; break; } - if (isTemporalToken(candidate)) { - lastAddedToken = parseTemporalToken(originalPattern, begin, candidate); + if (isCharacterTemporalToken(candidate)) { + lastAddedToken = parseCharacterTemporalToken(originalPattern, candidate, fillMode, begin); + fillMode = false; begin = end; break; } if (isTimeZoneToken(candidate)) { - lastAddedToken = parseTimeZoneToken(candidate); + lastAddedToken = parseTimeZoneToken(candidate, fillMode, begin); + begin = end; + break; + } + if (isTextToken(candidate)) { + lastAddedToken = parseTextToken(originalPattern, fillMode, begin); + end = begin + lastAddedToken.length + 2; // 2 quotation marks + begin = end; + lastAddedToken.removeBackslashes(); + break; + } + if (isFormatModifierToken(candidate)) { + fillMode = setOrGetFormatModifier(candidate, fillMode, begin); begin = end; break; } } } - return pattern; } private boolean isSeparator(String candidate) { return candidate.length() == 1 && 
VALID_SEPARATORS.contains(candidate); } - private boolean isIso8601Delimiter(String candidate) { - return candidate.length() == 1 && VALID_ISO_8601_DELIMITERS.contains(candidate); + private boolean isNumericTemporalToken(String candidate) { + return NUMERIC_TEMPORAL_TOKENS.containsKey(candidate); } - private boolean isTemporalToken(String candidate) { - return TEMPORAL_TOKENS.containsKey(candidate); + private boolean isCharacterTemporalToken(String candidate) { + return CHARACTER_TEMPORAL_TOKENS.containsKey(candidate); } private boolean isTimeZoneToken(String pattern) { return TIME_ZONE_TOKENS.containsKey(pattern); } - private Token parseSeparatorToken(String candidate, Token lastAddedToken) { + private boolean isTextToken(String candidate) { + return candidate.startsWith("\""); + } + + private boolean isFormatModifierToken(String candidate) { + return candidate.length() == 2 && VALID_FORMAT_MODIFIERS.contains(candidate); + } + + private Token parseSeparatorToken(String candidate, Token lastAddedToken, boolean fillMode, + int begin) { + checkFillModeOff(fillMode, begin); // try to clump separator with immediately preceding separators (e.g. 
"---" counts as one // separator) if (lastAddedToken != null && lastAddedToken.type == TokenType.SEPARATOR) { @@ -420,37 +579,79 @@ private Token parseSeparatorToken(String candidate, Token lastAddedToken) { return lastAddedToken; } - private Token parseIso8601DelimiterToken(String candidate) { - Token lastAddedToken; - lastAddedToken = new Token(TokenType.ISO_8601_DELIMITER, candidate.toUpperCase()); + private Token parseTemporalToken(String originalPattern, String candidate, boolean fillMode, + int begin) { + // for AM/PM, keep original case + if (NUMERIC_TEMPORAL_TOKENS.get(candidate) == ChronoField.AMPM_OF_DAY) { + int subStringEnd = begin + candidate.length(); + candidate = originalPattern.substring(begin, subStringEnd); + } + Token lastAddedToken = new Token(TokenType.NUMERIC_TEMPORAL, + NUMERIC_TEMPORAL_TOKENS.get(candidate.toLowerCase()), candidate, + getTokenStringLength(candidate), fillMode); tokens.add(lastAddedToken); return lastAddedToken; } - private Token parseTemporalToken(String originalPattern, int begin, String candidate) { - Token lastAddedToken; + private Token parseCharacterTemporalToken(String originalPattern, String candidate, + boolean fillMode, int begin) { + // keep original case + candidate = originalPattern.substring(begin, begin + candidate.length()); - // for AM/PM, keep original case - if (TEMPORAL_TOKENS.get(candidate) == ChronoField.AMPM_OF_DAY) { - int subStringEnd = begin + candidate.length(); - candidate = originalPattern.substring(begin, subStringEnd); - } - lastAddedToken = new Token(TEMPORAL_TOKENS.get(candidate.toLowerCase()), candidate, - getTokenStringLength(candidate.toLowerCase())); + Token lastAddedToken = new Token(TokenType.CHARACTER_TEMPORAL, + CHARACTER_TEMPORAL_TOKENS.get(candidate.toLowerCase()), candidate, + getTokenStringLength(candidate), fillMode); tokens.add(lastAddedToken); return lastAddedToken; } - private Token parseTimeZoneToken(String candidate) { - Token lastAddedToken; - lastAddedToken = new 
Token(TIME_ZONE_TOKENS.get(candidate), candidate, + private Token parseTimeZoneToken(String candidate, boolean fillMode, int begin) { + checkFillModeOff(fillMode, begin); + Token lastAddedToken = new Token(TIME_ZONE_TOKENS.get(candidate), candidate, getTokenStringLength(candidate)); tokens.add(lastAddedToken); return lastAddedToken; } + private Token parseTextToken(String fullPattern, boolean fillMode, int begin) { + checkFillModeOff(fillMode, begin); + int end = begin; + do { + end = fullPattern.indexOf('\"', end + 1); + if (end == -1) { + throw new IllegalArgumentException( + "Missing closing double quote (\") opened at index " + begin); + } + // if double quote is escaped with a backslash, keep looking for the closing quotation mark + } while ("\\".equals(fullPattern.substring(end - 1, end))); + Token lastAddedToken = new Token(TokenType.TEXT, fullPattern.substring(begin + 1, end)); + tokens.add(lastAddedToken); + return lastAddedToken; + } + + /** + * @return true if FM. FX sets a field. + */ + private boolean setOrGetFormatModifier(String candidate, boolean fillMode, int begin) { + checkFillModeOff(fillMode, begin); + if ("fm".equals(candidate)) { + return true; + } else { //fx + formatExact = true; + return false; + } + } + + private void checkFillModeOff(boolean fillMode, int index) { + if (fillMode) { + throw new IllegalArgumentException("Bad date/time conversion pattern: " + pattern + + ". 
Error at index " + index + ": Fill mode modifier (FM) must " + + "be followed by a temporal token."); + } + } + private int getTokenStringLength(String candidate) { - Integer length = SPECIAL_LENGTHS.get(candidate); + Integer length = SPECIAL_LENGTHS.get(candidate.toLowerCase()); if (length != null) { return length; } @@ -480,6 +681,21 @@ private void verifyForParse() { timeZoneTemporalUnits.add(token.temporalUnit); } } + + //check for illegal temporal fields + if (temporalFields.contains(IsoFields.QUARTER_OF_YEAR)) { + throw new IllegalArgumentException("Illegal field: q (" + IsoFields.QUARTER_OF_YEAR + ")"); + } + if (temporalFields.contains(WeekFields.SUNDAY_START.dayOfWeek())) { + throw new IllegalArgumentException("Illegal field: d/dy/day (" + WeekFields.SUNDAY_START.dayOfWeek() + ")"); + } + if (temporalFields.contains(ChronoField.ALIGNED_WEEK_OF_MONTH)) { + throw new IllegalArgumentException("Illegal field: w (" + ChronoField.ALIGNED_WEEK_OF_MONTH + ")"); + } + if (temporalFields.contains(ChronoField.ALIGNED_WEEK_OF_YEAR)) { + throw new IllegalArgumentException("Illegal field: ww (" + ChronoField.ALIGNED_WEEK_OF_YEAR + ")"); + } + if (!(temporalFields.contains(ChronoField.YEAR))) { throw new IllegalArgumentException("Missing year token."); } @@ -541,6 +757,14 @@ private void verifyForFormat() { throw new IllegalArgumentException(token.string.toUpperCase() + " not a valid format for " + "timestamp or date."); } + if (token.type == TokenType.CHARACTER_TEMPORAL) { + String s = token.string; + if (!(s.equals(s.toUpperCase()) || s.equals(capitalize(s)) || s.equals(s.toLowerCase()))) { + throw new IllegalArgumentException( + "Ambiguous capitalization of token " + s + ". 
Accepted " + "forms are " + s + .toUpperCase() + ", " + capitalize(s) + ", or " + s.toLowerCase() + "."); + } + } } } @@ -552,10 +776,15 @@ public String format(Timestamp ts) { LocalDateTime.ofEpochSecond(ts.toEpochSecond(), ts.getNanos(), ZoneOffset.UTC); for (Token token : tokens) { switch (token.type) { - case TEMPORAL: + case NUMERIC_TEMPORAL: + case CHARACTER_TEMPORAL: try { value = localDateTime.get(token.temporalField); - outputString = formatTemporal(value, token); + if (token.type == TokenType.NUMERIC_TEMPORAL) { + outputString = formatNumericTemporal(value, token); + } else { + outputString = formatCharacterTemporal(value, token); + } } catch (DateTimeException e) { throw new IllegalArgumentException(token.temporalField + " couldn't be obtained from " + "LocalDateTime " + localDateTime, e); @@ -565,11 +794,9 @@ public String format(Timestamp ts) { throw new IllegalArgumentException(token.string.toUpperCase() + " not a valid format for " + "timestamp or date."); case SEPARATOR: + case TEXT: outputString = token.string; break; - case ISO_8601_DELIMITER: - outputString = token.string.toUpperCase(); - break; default: // won't happen } @@ -582,7 +809,7 @@ public String format(Date date) { return format(Timestamp.ofEpochSecond(date.toEpochSecond())); } - private String formatTemporal(int value, Token token) { + private String formatNumericTemporal(int value, Token token) { String output; if (token.temporalField == ChronoField.AMPM_OF_DAY) { output = value == 0 ? "a" : "p"; @@ -605,49 +832,65 @@ private String formatTemporal(int value, Token token) { return output; } - /** - * To match token.length, pad left with zeroes or truncate. 
- */ - private String padOrTruncateNumericTemporal(Token token, String output) { - if (output.length() < token.length) { - output = StringUtils.leftPad(output, token.length, '0'); // pad left - } else if (output.length() > token.length) { - if (token.temporalField == ChronoField.NANO_OF_SECOND) { - output = output.substring(0, token.length); // truncate right - } else { - output = output.substring(output.length() - token.length); // truncate left - } + private String formatCharacterTemporal(int value, Token token) { + String output = null; + if (token.temporalField == ChronoField.MONTH_OF_YEAR) { + output = Month.of(value).getDisplayName(TextStyle.FULL, Locale.US); + } else if (token.temporalField == WeekFields.SUNDAY_START.dayOfWeek()) { + output = WeekFields.SUNDAY_START.getFirstDayOfWeek().plus(value - 1) // values start at 1 + .getDisplayName(TextStyle.FULL, Locale.US); } - if (token.temporalField == ChronoField.NANO_OF_SECOND - && token.string.equalsIgnoreCase("ff")) { - output = output.replaceAll("0*$", ""); //truncate trailing 0's - if (output.isEmpty()) { - output = "0"; - } + if (output == null) { + throw new IllegalStateException("TemporalField: " + token.temporalField + " not valid for " + + "character formatting."); + } + + // set length + if (output.length() > token.length) { + output = output.substring(0, token.length); // truncate to length + } else if (!token.fillMode && output.length() < token.length) { + output = StringUtils.rightPad(output, token.length); //pad to size + } + + // set case + if (Character.isUpperCase(token.string.charAt(1))) { + output = output.toUpperCase(); + } else if (Character.isLowerCase(token.string.charAt(0))) { + output = output.toLowerCase(); } return output; } /** - * Left here for timestamp with local time zone. + * To match token.length, pad left with zeroes or truncate. + * Omit padding if fill mode (FM) modifier on. 
*/ - private String formatTimeZone(TimeZone timeZone, LocalDateTime localDateTime, Token token) { - ZoneOffset offset = timeZone.toZoneId().getRules().getOffset(localDateTime); - Duration seconds = Duration.of(offset.get(ChronoField.OFFSET_SECONDS), ChronoUnit.SECONDS); - if (token.string.equals("tzh")) { - long hours = seconds.toHours(); - String s = (hours >= 0) ? "+" : "-"; - s += (Math.abs(hours) < 10) ? "0" : ""; - s += String.valueOf(Math.abs(hours)); - return s; + private String padOrTruncateNumericTemporal(Token token, String output) { + //exception + if (token.temporalField == ChronoField.NANO_OF_SECOND) { + output = StringUtils.leftPad(output, 9, '0'); // pad left to length 9 + if (output.length() > token.length) { + output = output.substring(0, token.length); // truncate right to size + } + if (token.string.equalsIgnoreCase("ff")) { + output = output.replaceAll("0*$", ""); //truncate trailing 0's + } + + // the rule } else { - long minutes = Math.abs(seconds.toMinutes() % MINUTES_PER_HOUR); - String s = String.valueOf(minutes); - if (s.length() == 1) { - s = "0" + s; + if (output.length() < token.length && !token.fillMode) { + output = StringUtils.leftPad(output, token.length, '0'); // pad left + } else if (output.length() > token.length) { + output = output.substring(output.length() - token.length); // truncate left + } + if (token.fillMode) { + output = output.replaceAll("^0*", ""); //truncate leading 0's } - return s; } + if (output.isEmpty()) { + output = "0"; + } + return output; } public Timestamp parseTimestamp(String fullInput){ @@ -659,9 +902,14 @@ public Timestamp parseTimestamp(String fullInput){ for (Token token : tokens) { switch (token.type) { - case TEMPORAL: + case NUMERIC_TEMPORAL: + case CHARACTER_TEMPORAL: substring = getNextSubstring(fullInput, index, token); // e.g. yy-m -> yy - value = parseTemporal(substring, token); // e.g. 
18->2018, July->07 + if (token.type == TokenType.NUMERIC_TEMPORAL) { + value = parseNumericTemporal(substring, token); // e.g. 18->2018 + } else { + value = parseCharacterTemporal(substring, token); // e.g. July->07 + } try { ldt = ldt.with(token.temporalField, value); } catch (DateTimeException e){ @@ -707,8 +955,8 @@ public Timestamp parseTimestamp(String fullInput){ case SEPARATOR: index = parseSeparator(fullInput, index, token); break; - case ISO_8601_DELIMITER: - index = parseIso8601Delimiter(fullInput, index, token); + case TEXT: + index = parseText(fullInput, index, token); default: //do nothing } @@ -728,9 +976,9 @@ public Date parseDate(String input){ } /** * Return the next substring to parse. Length is either specified or token.length, but a - * separator or an ISO-8601 delimiter can cut the substring short. (e.g. if the token pattern is - * "YYYY" we expect the next 4 characters to be 4 numbers. However, if it is "976/" then we - * return "976" because a separator cuts it short.) + * separator can cut the substring short. (e.g. if the token pattern is "YYYY" we expect the next + * 4 characters to be 4 numbers. However, if it is "976/" then we return "976" because a + * separator cuts it short.) 
*/ private String getNextSubstring(String s, int begin, Token token) { return getNextSubstring(s, begin, begin + token.length, token); @@ -748,16 +996,15 @@ private String getNextSubstring(String s, int begin, int end, Token token) { return s; } } - for (String sep : VALID_SEPARATORS) { - if (s.contains(sep)) { - s = s.substring(0, s.indexOf(sep)); - } - } - // TODO this will cause problems with DAY (for example, Thursday starts with T) - for (String delimiter : VALID_ISO_8601_DELIMITERS) { - if (s.toLowerCase().contains(delimiter)) { - s = s.substring(0, s.toLowerCase().indexOf(delimiter)); - } + // if it's a character temporal, the first non-letter character is a delimiter + if (token.type == TokenType.CHARACTER_TEMPORAL && s.matches(".*[^A-Za-z].*")) { + s = s.split("[^A-Za-z]", 2)[0]; + + // if it's a numeric element, next non-numeric character is a delimiter. Don't worry about + // AM/PM since we've already handled that case. + } else if ((token.type == TokenType.NUMERIC_TEMPORAL || token.type == TokenType.TIMEZONE) + && s.matches(".*\\D.*")) { + s = s.split("\\D", 2)[0]; } return s; @@ -765,8 +1012,11 @@ private String getNextSubstring(String s, int begin, int end, Token token) { /** * Get the integer value of a temporal substring. + * @throws IllegalArgumentException */ - private int parseTemporal(String substring, Token token){ + private int parseNumericTemporal(String substring, Token token){ + checkFormatExact(substring, token); + // exceptions to the rule if (token.temporalField == ChronoField.AMPM_OF_DAY) { return substring.toLowerCase().startsWith("a") ? 
AM : PM; @@ -805,6 +1055,40 @@ private int parseTemporal(String substring, Token token){ } } + private int parseCharacterTemporal(String substring, Token token) { + try { + if (token.temporalField == ChronoField.MONTH_OF_YEAR) { + if (token.length == 3) { + return Month.from(MONTH_FORMATTER.parse(capitalize(substring))).getValue(); + } else { + return Month.valueOf(substring.toUpperCase()).getValue(); + } + } + } catch (Exception e) { + throw new IllegalArgumentException( + "Couldn't parse substring \"" + substring + "\" with token " + token + " to integer. Pattern is " + pattern, e); + } + throw new IllegalArgumentException( + "token: (" + token + ") isn't a valid as a character to be parsed. Pattern is " + pattern); + } + + /** + * @throws IllegalArgumentException if input length doesn't match expected (token) length + */ + private void checkFormatExact(String substring, Token token) { + // AM/PM defaults to length 4 but make it 2 for FX check if the pattern actually has length 2 + if (formatExact && token.temporalField == ChronoField.AMPM_OF_DAY) { + token.length = token.string.length(); + } + if (formatExact + && !(token.fillMode || token.temporalField == ChronoField.NANO_OF_SECOND) + && token.length != substring.length()) { + throw new IllegalArgumentException( + "FX on and expected token length " + token.length + " for token " + token.toString() + + " does not match substring (" + substring + ") length " + substring.length()); + } + } + /** * Parse the next separator(s). At least one separator character is expected. Separator * characters are interchangeable. @@ -814,38 +1098,45 @@ private int parseTemporal(String substring, Token token){ * separator, UNLESS this is the only separator character in the separator substring (in * which case it is not counted as the negative sign). 
* - * @throws IllegalArgumentException if separator is missing + * @throws IllegalArgumentException if separator is missing or if FX is on and separator doesn't + * match the expected separator pattern exactly */ - private int parseSeparator(String fullInput, int index, Token token){ - int separatorsFound = 0; + private int parseSeparator(String fullInput, int index, Token token) { int begin = index; + String s; + StringBuilder separatorsFound = new StringBuilder(); while (index < fullInput.length() && VALID_SEPARATORS.contains(fullInput.substring(index, index + 1))) { + s = fullInput.substring(index, index + 1); if (!isLastCharacterOfSeparator(index, fullInput) - || !("-".equals(fullInput.substring(index, index + 1)) && (nextTokenIs("tzh", token))) - || separatorsFound == 0) { - separatorsFound++; + || !("-".equals(s) && (nextTokenIs("tzh", token))) + || separatorsFound.length() == 0) { + separatorsFound.append(s); } index++; } - if (separatorsFound == 0) { + if (separatorsFound.length() == 0) { throw new IllegalArgumentException("Missing separator at index " + index); } - return begin + separatorsFound; + if (formatExact && !token.string.equals(separatorsFound.toString())) { + throw new IllegalArgumentException("FX on and separator found: " + separatorsFound.toString() + + " doesn't match expected separator: " + token.string); + } + + return begin + separatorsFound.length(); } - private int parseIso8601Delimiter(String fullInput, int index, Token token) { + private int parseText(String fullInput, int index, Token token) { String substring; - substring = fullInput.substring(index, index + 1); - if (token.string.equalsIgnoreCase(substring)) { - index++; - } else { + substring = fullInput.substring(index, index + token.length); + if (!token.string.equalsIgnoreCase(substring)) { throw new IllegalArgumentException( - "Missing ISO 8601 delimiter " + token.string.toUpperCase()); + "Wrong input at index " + index + ": Expected: \"" + token.string + "\" but got: \"" + + 
substring + "\" for token: " + token); } - return index; + return index + token.length; } /** @@ -869,7 +1160,8 @@ private boolean nextTokenIs(String pattern, Token currentToken) { Token nextToken = tokens.get(tokens.indexOf(currentToken) + 1); pattern = pattern.toLowerCase(); return (isTimeZoneToken(pattern) && TIME_ZONE_TOKENS.get(pattern) == nextToken.temporalUnit - || isTemporalToken(pattern) && TEMPORAL_TOKENS.get(pattern) == nextToken.temporalField); + || isNumericTemporalToken(pattern) && NUMERIC_TEMPORAL_TOKENS.get(pattern) == nextToken.temporalField + || isCharacterTemporalToken(pattern) && CHARACTER_TEMPORAL_TOKENS.get(pattern) == nextToken.temporalField); } public String getPattern() { @@ -882,4 +1174,8 @@ public String getPattern() { protected List getTokens() { return new ArrayList<>(tokens); } + + private static String capitalize(String substring) { + return WordUtils.capitalize(substring.toLowerCase()); + } } diff --git a/common/src/test/org/apache/hadoop/hive/common/format/datetime/TestHiveSqlDateTimeFormatter.java b/common/src/test/org/apache/hadoop/hive/common/format/datetime/TestHiveSqlDateTimeFormatter.java index 4e822d53f9..618ab97e2a 100644 --- a/common/src/test/org/apache/hadoop/hive/common/format/datetime/TestHiveSqlDateTimeFormatter.java +++ b/common/src/test/org/apache/hadoop/hive/common/format/datetime/TestHiveSqlDateTimeFormatter.java @@ -25,22 +25,10 @@ import java.time.LocalDate; import java.time.LocalDateTime; -import java.time.ZoneOffset; -import java.time.format.DateTimeFormatter; -import java.time.format.DateTimeFormatterBuilder; -import java.time.format.ResolverStyle; -import java.time.format.SignStyle; import java.time.temporal.ChronoField; import java.time.temporal.TemporalField; import java.util.ArrayList; -import static java.time.temporal.ChronoField.DAY_OF_MONTH; -import static java.time.temporal.ChronoField.HOUR_OF_DAY; -import static java.time.temporal.ChronoField.MINUTE_OF_HOUR; -import static 
java.time.temporal.ChronoField.MONTH_OF_YEAR; -import static java.time.temporal.ChronoField.SECOND_OF_MINUTE; -import static java.time.temporal.ChronoField.YEAR; - /** * Tests HiveSqlDateTimeFormatter. */ @@ -74,6 +62,7 @@ public void testSetPattern() { } public void testSetPatternWithBadPatterns() { + verifyBadPattern("", true); verifyBadPattern("eyyyy-ddd", true); verifyBadPattern("1yyyy-mm-dd", true); @@ -97,32 +86,72 @@ public void testSetPatternWithBadPatterns() { verifyBadPattern("yyyy-mm-dd SSSSS AM", true); verifyBadPattern("yyyy-mm-dd MI SSSSS", true); verifyBadPattern("yyyy-mm-dd SS SSSSS", true); + verifyBadPattern("yyyy mm-MON dd", true); + verifyBadPattern("yyyy mm-MONTH dd", true); + verifyBadPattern("yyyy MON, month dd", true); verifyBadPattern("tzm", false); verifyBadPattern("tzh", false); + + //ambiguous case for formatting + verifyBadPattern("MOnth", false); + verifyBadPattern("DaY", false); + verifyBadPattern("dAy", false); + verifyBadPattern("dY", false); + + //illegal for parsing + verifyBadPattern("yyyy-mm-dd q", true); + verifyBadPattern("yyyy-mm-dd d", true); + verifyBadPattern("yyyy-mm-dd dy", true); + verifyBadPattern("yyyy-mm-dd day", true); + verifyBadPattern("yyyy-mm-dd w", true); + verifyBadPattern("yyyy-mm-dd ww", true); } public void testFormatTimestamp() { checkFormatTs("rr rrrr ddd", "2018-01-03 00:00:00", "18 2018 003"); - checkFormatTs("yyyy-mm-ddtsssss.ff4z", "2018-02-03 00:00:10.777777777", "2018-02-03T00010.7777Z"); + checkFormatTs("yyyy-mm-dd sssss.ff4", "2018-02-03 00:00:10.777777777", "2018-02-03 00010.7777"); checkFormatTs("hh24:mi:ss.ff1", "2018-02-03 01:02:03.999999999", "01:02:03.9"); - checkFormatTs("y yyy hh:mi:ss.ffz", "2018-02-03 01:02:03.0070070", "8 018 01:02:03.007007Z"); + checkFormatTs("y yyy hh:mi:ss.ff", "2018-02-03 01:02:03.0070070", "8 018 01:02:03.007007"); checkFormatTs("am a.m. pm p.m. AM A.M. PM P.M.", "2018-02-03 01:02:03.0070070", "am a.m. am a.m. AM A.M. 
AM A.M."); checkFormatTs("HH12 P.M.", "2019-01-01 00:15:10", "12 A.M."); checkFormatTs("HH12 AM", "2019-01-01 12:15:10", "12 PM"); checkFormatTs("YYYY-MM-DD HH12PM", "2017-05-05 00:00:00", "2017-05-05 12AM"); + + checkFormatTs("YYYY-MONTH-DD", "2019-01-01 00:00:00", "2019-JANUARY -01"); //fill to length 9 + checkFormatTs("YYYY-Month-DD", "2019-01-01 00:00:00", "2019-January -01"); + checkFormatTs("YYYY-month-DD", "2019-01-01 00:00:00", "2019-january -01"); + checkFormatTs("YYYY-MON-DD", "2019-01-01 00:00:00", "2019-JAN-01"); + checkFormatTs("YYYY-Mon-DD", "2019-01-01 00:00:00", "2019-Jan-01"); + checkFormatTs("YYYY-mon-DD", "2019-01-01 00:00:00", "2019-jan-01"); + + checkFormatTs("D: DAY", "2019-01-01 00:00:00", "3: TUESDAY "); //fill to length 9 + checkFormatTs("D: Day", "2019-01-02 00:00:00", "4: Wednesday"); + checkFormatTs("D: day", "2019-01-03 00:00:00", "5: thursday "); + checkFormatTs("D: DY", "2019-01-04 00:00:00", "6: FRI"); + checkFormatTs("D: Dy", "2019-01-05 00:00:00", "7: Sat"); + checkFormatTs("D: dy", "2019-01-06 00:00:00", "1: sun"); + checkFormatTs("D: DAY", "2019-01-07 00:00:00", "2: MONDAY "); + + checkFormatTs("YYYY-mm-dd: Q WW W", "2019-01-01 00:00:00", "2019-01-01: 1 01 1"); + checkFormatTs("YYYY-mm-dd: Q WW W", "2019-01-07 00:00:00", "2019-01-07: 1 01 1"); + checkFormatTs("YYYY-mm-dd: Q WW W", "2019-01-08 00:00:00", "2019-01-08: 1 02 2"); + checkFormatTs("YYYY-mm-dd: Q WW W", "2019-03-31 00:00:00", "2019-03-31: 1 13 5"); + checkFormatTs("YYYY-mm-dd: Q WW W", "2019-04-01 00:00:00", "2019-04-01: 2 13 1"); + checkFormatTs("YYYY-mm-dd: Q WW W", "2019-12-31 00:00:00", "2019-12-31: 4 53 5"); } private void checkFormatTs(String pattern, String input, String expectedOutput) { formatter = new HiveSqlDateTimeFormatter(pattern, false); - assertEquals(expectedOutput, formatter.format(toTimestamp(input))); + assertEquals("Format timestamp to string failed with pattern: " + pattern, + expectedOutput, formatter.format(Timestamp.valueOf(input))); } public 
void testFormatDate() { checkFormatDate("rr rrrr ddd", "2018-01-03", "18 2018 003"); - checkFormatDate("yyyy-mm-ddtsssss.ff4z", "2018-02-03", "2018-02-03T00000.0000Z"); + checkFormatDate("yyyy-mm-dd sssss.ff4 ", "2018-02-03", "2018-02-03 00000.0000 "); checkFormatDate("hh24:mi:ss.ff1", "2018-02-03", "00:00:00.0"); - checkFormatDate("y yyy T hh:mi:ss.ff am z", "2018-02-03", "8 018 T 12:00:00.0 am Z"); + checkFormatDate("y yyy hh:mi:ss.ff am", "2018-02-03", "8 018 12:00:00.0 am"); checkFormatDate("am a.m. pm p.m. AM A.M. PM P.M.", "2018-02-03", "am a.m. am a.m. AM A.M. AM A.M."); checkFormatDate("DDD", "2019-12-31", "365"); checkFormatDate("DDD", "2020-12-31", "366"); @@ -130,7 +159,8 @@ public void testFormatDate() { private void checkFormatDate(String pattern, String input, String expectedOutput) { formatter = new HiveSqlDateTimeFormatter(pattern, false); - assertEquals(expectedOutput, formatter.format(toDate(input))); + assertEquals("Format date to string failed with pattern: " + pattern, + expectedOutput, formatter.format(Date.valueOf(input))); } public void testParseTimestamp() { @@ -156,12 +186,12 @@ public void testParseTimestamp() { checkParseTimestamp("rrrr-mm-dd", "99-02-03", firstTwoDigits + "99-02-03 00:00:00"); //everything else - checkParseTimestamp("yyyy-mm-ddThh24:mi:ss.ff8z", "2018-02-03T04:05:06.5665Z", "2018-02-03 04:05:06.5665"); + checkParseTimestamp("yyyy-mm-dd hh24:mi:ss.ff8", "2018-02-03 04:05:06.5665", "2018-02-03 04:05:06.5665"); checkParseTimestamp("yyyy-mm-dd hh24:mi:ss.ff", "2018-02-03 04:05:06.555555555", "2018-02-03 04:05:06.555555555"); checkParseTimestamp("yyyy-mm-dd hh12:mi:ss", "2099-2-03 04:05:06", "2099-02-03 04:05:06"); checkParseTimestamp("yyyyddd", "2018284", "2018-10-11 00:00:00"); checkParseTimestamp("yyyyddd", "20184", "2018-01-04 00:00:00"); - checkParseTimestamp("yyyy-mm-ddThh24:mi:ss.ffz", "2018-02-03t04:05:06.444Z", "2018-02-03 04:05:06.444"); + checkParseTimestamp("yyyy-mm-dd hh24:mi:ss.ff ", "2018-02-03 04:05:06.444 ", 
"2018-02-03 04:05:06.444"); checkParseTimestamp("yyyy-mm-dd hh:mi:ss A.M.", "2018-02-03 04:05:06 P.M.", "2018-02-03 16:05:06"); checkParseTimestamp("YYYY-MM-DD HH24:MI TZH:TZM", "2019-1-1 14:00--1:-30", "2019-01-01 14:00:00"); checkParseTimestamp("YYYY-MM-DD HH24:MI TZH:TZM", "2019-1-1 14:00-1:30", "2019-01-01 14:00:00"); @@ -189,6 +219,21 @@ public void testParseTimestamp() { checkParseTimestamp("YYYYMMDDHH12MIA.M.TZHTZM", "201812310800AM+0515", "2018-12-31 08:00:00"); checkParseTimestamp("YYYYMMDDHH12MIA.M.TZHTZM", "201812310800AM0515", "2018-12-31 08:00:00"); checkParseTimestamp("YYYYMMDDHH12MIA.M.TZHTZM", "201812310800AM-0515", "2018-12-31 08:00:00"); + + //MONTH, MON : case really doesn't matter + checkParseTimestamp("yyyy-MONTH-dd", "2018-FEBRUARY-28", "2018-02-28 00:00:00"); + checkParseTimestamp("yyyy-Month-dd", "2018-february-28", "2018-02-28 00:00:00"); + checkParseTimestamp("yyyy-month-dd", "2018-FEBRUARY-28", "2018-02-28 00:00:00"); + checkParseTimestamp("yyyy-montH-dd", "2018-febRuary-28", "2018-02-28 00:00:00"); + checkParseTimestamp("yyyy-MON-dd", "2018-FEB-28", "2018-02-28 00:00:00"); + checkParseTimestamp("yyyy-moN-dd", "2018-FeB-28", "2018-02-28 00:00:00"); + checkParseTimestamp("yyyy-mon-dd", "2018-FEB-28", "2018-02-28 00:00:00"); + verifyBadParseString("yyyy-MON-dd", "2018-FEBRUARY-28"); + verifyBadParseString("yyyy-MON-dd", "2018-FEBR-28"); + verifyBadParseString("yyyy-MONTH-dd", "2018-FEB-28"); + //letters and numbers are delimiters to each other, respectively + checkParseDate("yyyy-ddMONTH", "2018-4March", "2018-03-04"); + checkParseDate("yyyy-MONTHdd", "2018-March4", "2018-03-04"); } private int getFirstTwoDigits() { @@ -202,7 +247,8 @@ private int getFirstTwoDigits() { private void checkParseTimestamp(String pattern, String input, String expectedOutput) { formatter = new HiveSqlDateTimeFormatter(pattern, true); - assertEquals(toTimestamp(expectedOutput), formatter.parseTimestamp(input)); + assertEquals("Parse string to timestamp failed. 
Pattern: " + pattern, + Timestamp.valueOf(expectedOutput), formatter.parseTimestamp(input)); } public void testParseDate() { @@ -232,30 +278,126 @@ public void testParseDate() { private void checkParseDate(String pattern, String input, String expectedOutput) { formatter = new HiveSqlDateTimeFormatter(pattern, true); - assertEquals(toDate(expectedOutput), formatter.parseDate(input)); + assertEquals("Parse string to date failed. Pattern: " + pattern, + Date.valueOf(expectedOutput), formatter.parseDate(input)); } public void testParseTimestampError() { - verifyBadParseString("yyyy", "2019-02-03"); verifyBadParseString("yyyy-mm-dd ", "2019-02-03"); //separator missing verifyBadParseString("yyyy-mm-dd", "2019-02-03..."); //extra separators verifyBadParseString("yyyy-mm-dd hh12:mi:ss", "2019-02-03 14:00:00"); //hh12 out of range - verifyBadParseString("yyyy-dddsssss", "2019-912345"); + verifyBadParseString("yyyy-dddsssss", "2019-912345"); //ddd out of range verifyBadParseString("yyyy-mm-dd", "2019-13-23"); //mm out of range verifyBadParseString("yyyy-mm-dd tzh:tzm", "2019-01-01 +16:00"); //tzh out of range verifyBadParseString("yyyy-mm-dd tzh:tzm", "2019-01-01 +14:60"); //tzm out of range verifyBadParseString("YYYY DDD", "2000 367"); //ddd out of range + verifyBadParseString("yyyy-month-dd", "2019-merch-23"); //invalid month of year + verifyBadParseString("yyyy-mon-dd", "2019-mer-23"); //invalid month of year } private void verifyBadPattern(String string, boolean forParsing) { try { formatter = new HiveSqlDateTimeFormatter(string, forParsing); - fail(); + fail("Bad pattern " + string + " should have thrown IllegalArgumentException but didn't"); } catch (Exception e) { - assertEquals(e.getClass().getName(), IllegalArgumentException.class.getName()); + assertEquals("Expected IllegalArgumentException, got another exception.", + e.getClass().getName(), IllegalArgumentException.class.getName()); } } + public void testFmFx() { + //fm + //year (019) becomes 19 even if pattern 
is yyy + checkFormatTs("FMyyy-FMmm-dd FMHH12:MI:FMSS", "2019-01-01 01:01:01", "19-1-01 1:01:1"); + //ff[1-9] shouldn't be affected, because leading zeroes hold information + checkFormatTs("FF5/FMFF5", "2019-01-01 01:01:01.0333", "03330/03330"); + checkFormatTs("FF/FMFF", "2019-01-01 01:01:01.0333", "0333/0333"); + //omit trailing spaces from character temporal elements + checkFormatTs("YYYY-fmMonth-DD", "2019-01-01 00:00:00", "2019-January-01"); + checkFormatTs("D: fmDAY", "2019-01-01 00:00:00", "3: TUESDAY"); + checkFormatTs("D: fmDay", "2019-01-02 00:00:00", "4: Wednesday"); + + //only affects temporals that immediately follow + verifyBadPattern("yyy-mm-dd FM,HH12", false); + verifyBadPattern("yyy-mm-dd FM,HH12", true); + verifyBadPattern("yyy-mm-dd HH12 tzh:fmtzm", true); + verifyBadPattern("FMFMyyy-mm-dd", true); + verifyBadPattern("FMFXDD-MM-YYYY ff2", true); + + //fx + checkParseDate("FXDD-MM-YYYY", "01-01-1998", "1998-01-01"); + checkParseTimestamp("FXDD-MM-YYYY hh12:mi:ss.ff", "15-01-1998 11:12:13.0", "1998-01-15 11:12:13"); + //ff[1-9] are exempt + checkParseTimestamp("FXDD-MM-YYYY hh12:mi:ss.ff6", "01-01-1998 00:00:00.4440", "1998-01-01 00:00:00.444"); + //fx can be anywhere in the pattern string + checkParseTimestamp("DD-MM-YYYYFX", "01-01-1998", "1998-01-01 00:00:00"); + verifyBadParseString("DD-MM-YYYYFX", "1-01-1998"); + //same separators required + verifyBadParseString("FXDD-MM-YYYY", "15/01/1998"); + //no filling in zeroes or year digits + verifyBadParseString("FXDD-MM-YYYY", "1-01-1998"); + verifyBadParseString("FXDD-MM-YYYY", "01-01-98"); + //no leading or trailing whitespace + verifyBadParseString("FXDD-MM-YYYY", " 01-01-1998 "); + //enforce correct amount of leading zeroes + verifyBadParseString("FXyyyy-mm-dd hh:miss PM", "2018-01-01 17:005 PM"); + verifyBadParseString("FXyyyy-mm-dd sssss", "2019-01-01 003"); + //text case does not matter + checkParseTimestamp("\"the DATE is\" yyyy-mm-dd", "the date is 2018-01-01", "2018-01-01 00:00:00"); + 
//AM/PM length has to match, but case doesn't + checkParseTimestamp("FXDD-MM-YYYY hh12 am", "01-01-1998 12 PM", "1998-01-01 12:00:00"); + checkParseTimestamp("FXDD-MM-YYYY hh12 A.M.", "01-01-1998 12 p.m.", "1998-01-01 12:00:00"); + verifyBadParseString("FXDD-MM-YYYY hh12 am", "01-01-1998 12 p.m."); + verifyBadParseString("FXDD-MM-YYYY hh12 a.m.", "01-01-1998 12 pm"); + //character temporals shouldn't have trailing spaces + checkParseTimestamp("FXDD-month-YYYY", "15-March-1998", "1998-03-15 00:00:00"); + + //fm modifies fx + checkParseTimestamp("FXDD-FMMM-YYYY hh12 am", "01-1-1998 12 PM", "1998-01-01 12:00:00"); + checkParseTimestamp("FXFMDD-MM-YYYY hh12 am", "1-01-1998 12 PM", "1998-01-01 12:00:00"); + //ff[1-9] unaffected + checkParseTimestamp("FXFMDD-MM-YYYY FMff2", "1-01-1998 4", "1998-01-01 00:00:00.4"); + checkParseTimestamp("FXFMDD-MM-YYYY ff2", "1-01-1998 4", "1998-01-01 00:00:00.4"); + } + + public void testText() { + // keep exact text upon format + checkFormatTs("hh24:mi \" Is \" hh12 PM\".\"", "2008-01-01 17:00:00", "17:00 Is 05 PM."); + checkFormatDate("\" `the _year_ is` \" yyyy\".\"", "2008-01-01", " `the _year_ is` 2008."); + // empty text strings work + checkParseTimestamp("\"\"yyyy\"\"-mm-dd\"\"", "2019-01-01", "2019-01-01 00:00:00"); + checkParseDate("\"\"yyyy\"\"-mm-dd\"\"", "2019-01-01", "2019-01-01"); + // Case doesn't matter upon parsing + checkParseTimestamp("\"Year \"YYYY \"month\" MM \"day\" DD.\"!\"", + "YEaR 3000 mOnTh 3 DaY 1...!", "3000-03-01 00:00:00"); + checkParseDate("\"Year \"YYYY \"month\" MM \"day\" DD.\"!\"", + "YEaR 3000 mOnTh 3 DaY 1...!", "3000-03-01"); + // Characters matter upon parsing + verifyBadParseString("\"Year! 
\"YYYY \"m\" MM \"d\" DD.\"!\"", "Year 3000 m 3 d 1,!"); + // non-numeric characters in text counts as a delimiter + checkParseDate("yyyy\"m\"mm\"d\"dd", "19m1d1", LocalDate.now().getYear() / 100 + "19-01-01"); + checkParseDate("yyyy\"[\"mm\"]\"dd", "19[1]1", LocalDate.now().getYear() / 100 + "19-01-01"); + + // single quotes are separators and not text delimiters + checkParseTimestamp("\"Y\'ear \"YYYY \' \"month\" MM \"day\" DD.\"!\"", + "Y'EaR 3000 ' mOnTh 3 DaY 1...!", "3000-03-01 00:00:00"); + checkParseDate("\"Y\'ear \"YYYY \' \"month\" MM \"day\" DD.\"!\"", + "Y'EaR 3000 ' mOnTh 3 DaY 1...!", "3000-03-01"); + // literal double quotes are escaped + checkFormatTs("\"the \\\"DATE\\\" is\" yyyy-mm-dd", + "2018-01-01 00:00:00", "the \"DATE\" is 2018-01-01"); + checkFormatTs("\"\\\"\\\"\\\"\"", "2018-01-01 00:00:00", "\"\"\""); + checkParseTimestamp("\"the \\\"DATE\\\" is\" yyyy-mm-dd", + "the \"date\" is 2018-01-01", "2018-01-01 00:00:00"); + // Check variations of apostrophes, literal and non-literal double quotes + checkParseTimestamp("yyyy'\"\"mm-dd", "2019\'01-01", "2019-01-01 00:00:00"); + checkParseTimestamp("yyyy\'\"\"mm-dd", "2019\'01-01", "2019-01-01 00:00:00"); + checkParseTimestamp("yyyy'\"\"mm-dd", "2019'01-01", "2019-01-01 00:00:00"); + checkParseTimestamp("yyyy\'\"\"mm-dd", "2019'01-01", "2019-01-01 00:00:00"); + checkParseTimestamp("yyyy\'\"\\\"\"mm-dd", "2019'\"01-01", "2019-01-01 00:00:00"); + checkParseTimestamp("yyyy\'\"\\\"\"mm-dd", "2019\'\"01-01", "2019-01-01 00:00:00"); + } + /** * Verify pattern is parsed correctly. 
* Check: @@ -268,7 +410,7 @@ private void verifyPatternParsing(String pattern, ArrayList tempo } private void verifyPatternParsing(String pattern, int expectedPatternLength, - String expectedPattern, ArrayList temporalFields) { + String expectedPattern, ArrayList temporalFields) { formatter = new HiveSqlDateTimeFormatter(pattern, false); assertEquals(temporalFields.size(), formatter.getTokens().size()); StringBuilder sb = new StringBuilder(); @@ -285,46 +427,14 @@ private void verifyPatternParsing(String pattern, int expectedPatternLength, } private void verifyBadParseString(String pattern, String string) { + formatter = new HiveSqlDateTimeFormatter(pattern, true); try { - formatter = new HiveSqlDateTimeFormatter(pattern, true); formatter.parseTimestamp(string); - fail(); + fail("Parse string to timestamp should have failed.\nString: " + string + "\nPattern: " + + pattern); } catch (Exception e) { - assertEquals(e.getClass().getName(), IllegalArgumentException.class.getName()); + assertEquals("Expected IllegalArgumentException, got another exception.", + e.getClass().getName(), IllegalArgumentException.class.getName()); } } - - - // Methods that construct datetime objects using java.time.DateTimeFormatter. - - public static Date toDate(String s) { - LocalDate localDate = LocalDate.parse(s, DATE_FORMATTER); - return Date.ofEpochDay((int) localDate.toEpochDay()); - } - - /** - * This is effectively the old Timestamp.valueOf method. 
- */ - public static Timestamp toTimestamp(String s) { - LocalDateTime localDateTime = LocalDateTime.parse(s.trim(), TIMESTAMP_FORMATTER); - return Timestamp.ofEpochSecond( - localDateTime.toEpochSecond(ZoneOffset.UTC), localDateTime.getNano()); - } - - private static final DateTimeFormatter DATE_FORMATTER = - DateTimeFormatter.ofPattern("yyyy-MM-dd"); - private static final DateTimeFormatter TIMESTAMP_FORMATTER; - static { - DateTimeFormatterBuilder builder = new DateTimeFormatterBuilder(); - builder.appendValue(YEAR, 1, 10, SignStyle.NORMAL).appendLiteral('-') - .appendValue(MONTH_OF_YEAR, 1, 2, SignStyle.NORMAL).appendLiteral('-') - .appendValue(DAY_OF_MONTH, 1, 2, SignStyle.NORMAL) - .optionalStart().appendLiteral(" ") - .appendValue(HOUR_OF_DAY, 1, 2, SignStyle.NORMAL).appendLiteral(':') - .appendValue(MINUTE_OF_HOUR, 1, 2, SignStyle.NORMAL).appendLiteral(':') - .appendValue(SECOND_OF_MINUTE, 1, 2, SignStyle.NORMAL) - .optionalStart().appendFraction(ChronoField.NANO_OF_SECOND, 1, 9, true).optionalEnd() - .optionalEnd(); - TIMESTAMP_FORMATTER = builder.toFormatter().withResolverStyle(ResolverStyle.LENIENT); - } } diff --git a/ql/src/test/queries/clientpositive/cast_datetime_with_sql_2016_format.q b/ql/src/test/queries/clientpositive/cast_datetime_with_sql_2016_format.q index 269edf6da6..e2e56913e6 100644 --- a/ql/src/test/queries/clientpositive/cast_datetime_with_sql_2016_format.q +++ b/ql/src/test/queries/clientpositive/cast_datetime_with_sql_2016_format.q @@ -37,6 +37,11 @@ from varchars select cast (s as date format "yyyy.mm.dd"); from chars select cast (s as timestamp format "yyyy.mm.dd"); from chars select cast (s as date format "yyyy.mm.dd"); +--quotation marks, apostrophes, and literal quotation marks are handled correctly +select +cast ("2019\' \' '' 01-01" as timestamp format "yyyy\'' \'' mm-dd"), +cast ("2019\"01-01" as timestamp format "yyyy\"\\\"\"mm-dd"), +cast ('2019\' " \' 01-01' as timestamp format 'yyyy\' "\\" \'" mm-dd'); --correct 
descriptions explain from strings select cast (s as timestamp format "yyy.mm.dd"); diff --git a/ql/src/test/results/clientpositive/cast_datetime_with_sql_2016_format.q.out b/ql/src/test/results/clientpositive/cast_datetime_with_sql_2016_format.q.out index 4a502b9700..b9526f3b30 100644 --- a/ql/src/test/results/clientpositive/cast_datetime_with_sql_2016_format.q.out +++ b/ql/src/test/results/clientpositive/cast_datetime_with_sql_2016_format.q.out @@ -227,6 +227,21 @@ POSTHOOK: Input: default@chars #### A masked pattern was here #### 2020-02-03 1969-12-31 +PREHOOK: query: select +cast ("2019\' \' '' 01-01" as timestamp format "yyyy\'' \'' mm-dd"), +cast ("2019\"01-01" as timestamp format "yyyy\"\\\"\"mm-dd"), +cast ('2019\' " \' 01-01' as timestamp format 'yyyy\' "\\" \'" mm-dd') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +#### A masked pattern was here #### +POSTHOOK: query: select +cast ("2019\' \' '' 01-01" as timestamp format "yyyy\'' \'' mm-dd"), +cast ("2019\"01-01" as timestamp format "yyyy\"\\\"\"mm-dd"), +cast ('2019\' " \' 01-01' as timestamp format 'yyyy\' "\\" \'" mm-dd') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +#### A masked pattern was here #### +2019-01-01 00:00:00 2019-01-01 00:00:00 2019-01-01 00:00:00 PREHOOK: query: explain from strings select cast (s as timestamp format "yyy.mm.dd") PREHOOK: type: QUERY PREHOOK: Input: default@strings