diff --git common/src/java/org/apache/hadoop/hive/common/format/datetime/HiveSqlDateTimeFormatter.java common/src/java/org/apache/hadoop/hive/common/format/datetime/HiveSqlDateTimeFormatter.java index 4e024a357b..d106565e2d 100644 --- common/src/java/org/apache/hadoop/hive/common/format/datetime/HiveSqlDateTimeFormatter.java +++ common/src/java/org/apache/hadoop/hive/common/format/datetime/HiveSqlDateTimeFormatter.java @@ -25,7 +25,6 @@ import org.apache.hadoop.hive.common.type.Timestamp; import java.time.DateTimeException; -import java.time.Duration; import java.time.Instant; import java.time.LocalDateTime; import java.time.ZoneOffset; @@ -37,13 +36,13 @@ import java.util.Collections; import java.util.List; import java.util.Map; -import java.util.TimeZone; /** * Formatter using SQL:2016 datetime patterns. * * For all tokens: - * - Patterns are case-insensitive, except AM/PM and T/Z. See these sections for more details. + * - Patterns are case-insensitive, except AM/PM and nested strings. See these sections for more + * details. * - For string to datetime conversion, no duplicate format tokens are allowed, including tokens * that have the same meaning but different lengths ("Y" and "YY" conflict) or different * behaviors ("RR" and "YY" conflict). @@ -53,12 +52,18 @@ * "Y": 1, and so on), with some exceptions (see map SPECIAL_LENGTHS). * - For string to datetime conversion, inputs of fewer digits than expected are accepted if * followed by a delimiter, e.g. format="YYYY-MM-DD", input="19-1-1", output=2019-01-01 00:00:00. + * This is modified by format modifier FX (format exact). See FX for details. * - For datetime to string conversion, output is left padded with zeros, e.g. format="DD SSSSS", * input=2019-01-01 00:00:03, output="01 00003". + * This is modified by format modifier FM (fill mode). See FM for details. * * * Accepted format tokens: - * Note: "|" means "or". "Delimiter" means a separator, tokens T or Z, or end of input. + * Note: - "|" means "or". 
+ * - "Delimiter" means a separator, the beginning of a nested string (must be obvious – e.g. + * if the token in question is numeric, any non-numeric character is a delimiter), or end + * of input. + * - The words token and pattern are used interchangeably. * * A. Temporal tokens * YYYY @@ -67,7 +72,6 @@ * from current date * E.g. input=‘9-01-01’, pattern =‘YYYY-MM-DD’, current year=2020, output=2029-01-01 00:00:00 * - * * YYY * Last 3 digits of a year * - Gets the prefix digit from current date. @@ -148,8 +152,8 @@ * - In string to datetime conversion, fewer digits than expected are accepted if followed by a * delimiter. "FF" acts like "FF9". * - * AM|A.M. - * Meridiem indicator or AM/PM + * AM|A.M.|PM|P.M. + * Meridiem indicator (or AM/PM) * - Datetime to string conversion: * - AM and PM mean the exact same thing in the pattern. * e.g. input=2019-01-01 20:00, format=“AM”, output=“PM”. @@ -162,10 +166,6 @@ * E.g. input="2019-01-01 11:00 p.m.", pattern="YYYY-MM-DD HH12:MI AM", * output=2019-01-01 23:00:00 * - * PM|P.M. - * Meridiem indicator - * See AM|A.M. - * * B. Time zone tokens * TZH * Time zone offset hour (-15 to +15) @@ -197,30 +197,58 @@ * by a time zone hour (tzh) token, it's a negative sign and not counted as a separator, UNLESS * this is the only possible separator character in the separator substring (in which case it is * not counted as the tzh's negative sign). + * - If the whole pattern string is delimited by single quotes (''), then the apostrophe separator + * (') must be escaped with a single backslash: (\'). + * + * D. Nested strings (Text) + * - Surround with double quotes (") in the pattern. Note, if the whole pattern string is delimited + * by double quotes, then the double quotes must be escaped with a single backslash: (\"). + * - In order to include a literal double quote character within the nested string, the double + * quote character must be escaped with a double backslash: (\\").
If the whole pattern string is + * delimited by double quotes, then escape with a triple backslash: (\\\") + * - If the whole pattern string is delimited by single quotes, literal single + * quotes/apostrophes (') in the nested string must be escaped with a single backslash: (\') + * - For datetime to string conversion, we simply include the string in the output, preserving the + * characters' case. + * - For string to datetime conversion, the information is lost as the nested string won't be part + * of the resulting datetime object. However, the nested string has to match the related part of + * the input string, except case may differ. * - * D. ISO 8601 delimiters - * T - * ISO 8601 delimiter - * - Serves as a delimiter. - * - Function is to support formats like “YYYY-MM-DDTHH24:MI:SS.FF9Z”, “YYYY-MM-DD-HH24:MI:SSZ” - * - For datetime to string conversion, output is always capitalized ("T"), even if lowercase ("t") - * is provided in the pattern. + * E. Format modifier tokens + * FM + * Fill mode modifier + * - Default for string to datetime conversion. Inputs of fewer digits than expected are accepted + * if followed by a delimiter: + * e.g. format="YYYY-MM-DD", input="19-1-1", output=2019-01-01 00:00:00 + * - For datetime to string conversion, padding (trailing spaces for text data and leading zeroes + * for numeric data) is omitted for the temporal element immediately following an "FM" in the + * pattern string. If the element following is not a temporal element (for example, if "FM" + * precedes a separator), an error will be thrown. + * e.g. pattern=FMHH12:MI:FMSS, input=2019-01-01 01:01:01, output=1:01:1 + * - Modifies FX so that lack of leading zeroes is accepted for the element immediately following + * an "FM" in the pattern string. * - * Z - * ISO 8601 delimiter - * See T. + * FX + * Format exact modifier + * - Default for datetime to string conversion.
Numeric output is left padded with zeros, and + * non-numeric output except for AM/PM is right padded with spaces up to expected length. + * - Applies to the whole pattern. + * - Rules applied at string to datetime conversion: + * - Separators must match exactly, down to the character. + * - Numeric input can't omit leading zeroes. This rule does not apply to elements (tokens) + * immediately preceded by an "FM." */ public class HiveSqlDateTimeFormatter { private static final int LONGEST_TOKEN_LENGTH = 5; private static final int LONGEST_ACCEPTED_PATTERN = 100; // for sanity's sake - private static final long MINUTES_PER_HOUR = 60; private static final int NANOS_MAX_LENGTH = 9; public static final int AM = 0; public static final int PM = 1; private String pattern; private List tokens = new ArrayList<>(); + private boolean formatExact = false; private static final Map TEMPORAL_TOKENS = ImmutableMap.builder() @@ -249,12 +277,12 @@ ImmutableMap.builder() .put("tzh", ChronoUnit.HOURS).put("tzm", ChronoUnit.MINUTES).build(); - private static final List VALID_ISO_8601_DELIMITERS = - ImmutableList.of("t", "z"); - private static final List VALID_SEPARATORS = ImmutableList.of("-", ":", " ", ".", "/", ";", "\'", ","); + private static final List VALID_FORMAT_MODIFIERS = + ImmutableList.of("fm", "fx"); + private static final Map SPECIAL_LENGTHS = ImmutableMap.builder() .put("hh12", 2).put("hh24", 2).put("tzm", 2).put("am", 4).put("pm", 4) .put("ff1", 1).put("ff2", 2).put("ff3", 3).put("ff4", 4).put("ff5", 5) @@ -268,7 +296,7 @@ TEMPORAL, SEPARATOR, TIMEZONE, - ISO_8601_DELIMITER + TEXT } /** @@ -280,29 +308,31 @@ TemporalUnit temporalUnit; // for type TIMEZONE e.g. ChronoUnit.HOURS String string; // pattern string, e.g. "yyy" int length; // length (e.g. 
YYY: 3, FF8: 8) + boolean fillMode; //FM, applies to type TEMPORAL only (later should apply to TIMEZONE as well) - public Token(TemporalField temporalField, String string, int length) { - this(TokenType.TEMPORAL, temporalField, null, string, length); + public Token(TemporalField temporalField, String string, int length, boolean fillMode) { + this(TokenType.TEMPORAL, temporalField, null, string, length, fillMode); } public Token(TemporalUnit temporalUnit, String string, int length) { - this(TokenType.TIMEZONE, null, temporalUnit, string, length); + this(TokenType.TIMEZONE, null, temporalUnit, string, length, false); } public Token(TokenType tokenType, String string) { - this(tokenType, null, null, string, string.length()); + this(tokenType, null, null, string, string.length(), false); } public Token(TokenType tokenType, TemporalField temporalField, TemporalUnit temporalUnit, - String string, int length) { + String string, int length, boolean fillMode) { this.type = tokenType; this.temporalField = temporalField; this.temporalUnit = temporalUnit; this.string = string; this.length = length; + this.fillMode = fillMode; } - public String toString() { + @Override public String toString() { StringBuilder sb = new StringBuilder(); sb.append(string); sb.append(" type: "); @@ -316,6 +346,11 @@ public String toString() { } return sb.toString(); } + + public void removeBackslashes() { + string = string.replaceAll("\\\\", ""); + length = string.length(); + } } public HiveSqlDateTimeFormatter(String pattern, boolean forParsing) { @@ -342,7 +377,7 @@ private void setPattern(String pattern, boolean forParsing) { /** * Parse pattern to list of tokens. 
*/ - private String parsePatternToTokens(String pattern) { + private void parsePatternToTokens(String pattern) { tokens.clear(); String originalPattern = pattern; pattern = pattern.toLowerCase(); @@ -351,6 +386,7 @@ private String parsePatternToTokens(String pattern) { int begin=0, end=0; String candidate; Token lastAddedToken = null; + boolean fillMode = false; while (begin < pattern.length()) { // if begin hasn't progressed, then pattern is not parsable @@ -367,38 +403,41 @@ private String parsePatternToTokens(String pattern) { } candidate = pattern.substring(begin, end); if (isSeparator(candidate)) { - lastAddedToken = parseSeparatorToken(candidate, lastAddedToken); + lastAddedToken = parseSeparatorToken(candidate, lastAddedToken, fillMode, begin); begin = end; break; } - if (isIso8601Delimiter(candidate)) { - lastAddedToken = parseIso8601DelimiterToken(candidate); + if (isTemporalToken(candidate)) { + lastAddedToken = parseTemporalToken(originalPattern, candidate, fillMode, begin); + fillMode = false; begin = end; break; } - if (isTemporalToken(candidate)) { - lastAddedToken = parseTemporalToken(originalPattern, begin, candidate); + if (isTimeZoneToken(candidate)) { + lastAddedToken = parseTimeZoneToken(candidate, fillMode, begin); begin = end; break; } - if (isTimeZoneToken(candidate)) { - lastAddedToken = parseTimeZoneToken(candidate); + if (isTextToken(candidate)) { + lastAddedToken = parseTextToken(originalPattern, fillMode, begin); + end = begin + lastAddedToken.length + 2; // 2 quotation marks + begin = end; + lastAddedToken.removeBackslashes(); + break; + } + if (isFormatModifierToken(candidate)) { + fillMode = setOrGetFormatModifier(candidate, fillMode, begin); begin = end; break; } } } - return pattern; } private boolean isSeparator(String candidate) { return candidate.length() == 1 && VALID_SEPARATORS.contains(candidate); } - private boolean isIso8601Delimiter(String candidate) { - return candidate.length() == 1 && 
VALID_ISO_8601_DELIMITERS.contains(candidate); - } - private boolean isTemporalToken(String candidate) { return TEMPORAL_TOKENS.containsKey(candidate); } @@ -407,7 +446,17 @@ private boolean isTimeZoneToken(String pattern) { return TIME_ZONE_TOKENS.containsKey(pattern); } - private Token parseSeparatorToken(String candidate, Token lastAddedToken) { + private boolean isTextToken(String candidate) { + return candidate.startsWith("\""); + } + + private boolean isFormatModifierToken(String candidate) { + return candidate.length() == 2 && VALID_FORMAT_MODIFIERS.contains(candidate); + } + + private Token parseSeparatorToken(String candidate, Token lastAddedToken, boolean fillMode, + int begin) { + checkFillModeOff(fillMode, begin); // try to clump separator with immediately preceding separators (e.g. "---" counts as one // separator) if (lastAddedToken != null && lastAddedToken.type == TokenType.SEPARATOR) { @@ -420,35 +469,64 @@ private Token parseSeparatorToken(String candidate, Token lastAddedToken) { return lastAddedToken; } - private Token parseIso8601DelimiterToken(String candidate) { - Token lastAddedToken; - lastAddedToken = new Token(TokenType.ISO_8601_DELIMITER, candidate.toUpperCase()); - tokens.add(lastAddedToken); - return lastAddedToken; - } - - private Token parseTemporalToken(String originalPattern, int begin, String candidate) { - Token lastAddedToken; - + private Token parseTemporalToken(String originalPattern, String candidate, boolean fillMode, + int begin) { // for AM/PM, keep original case if (TEMPORAL_TOKENS.get(candidate) == ChronoField.AMPM_OF_DAY) { int subStringEnd = begin + candidate.length(); candidate = originalPattern.substring(begin, subStringEnd); } - lastAddedToken = new Token(TEMPORAL_TOKENS.get(candidate.toLowerCase()), candidate, - getTokenStringLength(candidate.toLowerCase())); + Token lastAddedToken = new Token(TEMPORAL_TOKENS.get(candidate.toLowerCase()), candidate, + getTokenStringLength(candidate.toLowerCase()), fillMode); 
tokens.add(lastAddedToken); return lastAddedToken; } - private Token parseTimeZoneToken(String candidate) { - Token lastAddedToken; - lastAddedToken = new Token(TIME_ZONE_TOKENS.get(candidate), candidate, + private Token parseTimeZoneToken(String candidate, boolean fillMode, int begin) { + checkFillModeOff(fillMode, begin); + Token lastAddedToken = new Token(TIME_ZONE_TOKENS.get(candidate), candidate, getTokenStringLength(candidate)); tokens.add(lastAddedToken); return lastAddedToken; } + private Token parseTextToken(String fullPattern, boolean fillMode, int begin) { + checkFillModeOff(fillMode, begin); + int end = begin; + do { + end = fullPattern.indexOf('\"', end + 1); + if (end == -1) { + throw new IllegalArgumentException( + "Missing closing double quote (\") opened at index " + begin); + } + // if double quote is escaped with a backslash, keep looking for the closing quotation mark + } while ("\\".equals(fullPattern.substring(end - 1, end))); + Token lastAddedToken = new Token(TokenType.TEXT, fullPattern.substring(begin + 1, end)); + tokens.add(lastAddedToken); + return lastAddedToken; + } + + /** + * @return true if FM. FX sets a field. + */ + private boolean setOrGetFormatModifier(String candidate, boolean fillMode, int begin) { + checkFillModeOff(fillMode, begin); + if ("fm".equals(candidate)) { + return true; + } else { //fx + formatExact = true; + return false; + } + } + + private void checkFillModeOff(boolean fillMode, int index) { + if (fillMode) { + throw new IllegalArgumentException("Bad date/time conversion pattern: " + pattern + + ". 
Error at index " + index + ": Fill mode modifier (FM) must " + + "be followed by a temporal token."); + } + } + private int getTokenStringLength(String candidate) { Integer length = SPECIAL_LENGTHS.get(candidate); if (length != null) { @@ -565,11 +643,9 @@ public String format(Timestamp ts) { throw new IllegalArgumentException(token.string.toUpperCase() + " not a valid format for " + "timestamp or date."); case SEPARATOR: + case TEXT: outputString = token.string; break; - case ISO_8601_DELIMITER: - outputString = token.string.toUpperCase(); - break; default: // won't happen } @@ -607,47 +683,34 @@ private String formatTemporal(int value, Token token) { /** * To match token.length, pad left with zeroes or truncate. + * Omit padding if fill mode (FM) modifier on. */ private String padOrTruncateNumericTemporal(Token token, String output) { - if (output.length() < token.length) { - output = StringUtils.leftPad(output, token.length, '0'); // pad left - } else if (output.length() > token.length) { - if (token.temporalField == ChronoField.NANO_OF_SECOND) { - output = output.substring(0, token.length); // truncate right - } else { - output = output.substring(output.length() - token.length); // truncate left + //exception + if (token.temporalField == ChronoField.NANO_OF_SECOND) { + output = StringUtils.leftPad(output, 9, '0'); // pad left to length 9 + if (output.length() > token.length) { + output = output.substring(0, token.length); // truncate right to size } - } - if (token.temporalField == ChronoField.NANO_OF_SECOND - && token.string.equalsIgnoreCase("ff")) { - output = output.replaceAll("0*$", ""); //truncate trailing 0's - if (output.isEmpty()) { - output = "0"; + if (token.string.equalsIgnoreCase("ff")) { + output = output.replaceAll("0*$", ""); //truncate trailing 0's } - } - return output; - } - /** - * Left here for timestamp with local time zone. 
- */ - private String formatTimeZone(TimeZone timeZone, LocalDateTime localDateTime, Token token) { - ZoneOffset offset = timeZone.toZoneId().getRules().getOffset(localDateTime); - Duration seconds = Duration.of(offset.get(ChronoField.OFFSET_SECONDS), ChronoUnit.SECONDS); - if (token.string.equals("tzh")) { - long hours = seconds.toHours(); - String s = (hours >= 0) ? "+" : "-"; - s += (Math.abs(hours) < 10) ? "0" : ""; - s += String.valueOf(Math.abs(hours)); - return s; + // the rule } else { - long minutes = Math.abs(seconds.toMinutes() % MINUTES_PER_HOUR); - String s = String.valueOf(minutes); - if (s.length() == 1) { - s = "0" + s; + if (output.length() < token.length && !token.fillMode) { + output = StringUtils.leftPad(output, token.length, '0'); // pad left + } else if (output.length() > token.length) { + output = output.substring(output.length() - token.length); // truncate left } - return s; + if (token.fillMode) { + output = output.replaceAll("^0*", ""); //truncate leading 0's + } + } + if (output.isEmpty()) { + output = "0"; } + return output; } public Timestamp parseTimestamp(String fullInput){ @@ -707,8 +770,8 @@ public Timestamp parseTimestamp(String fullInput){ case SEPARATOR: index = parseSeparator(fullInput, index, token); break; - case ISO_8601_DELIMITER: - index = parseIso8601Delimiter(fullInput, index, token); + case TEXT: + index = parseText(fullInput, index, token); default: //do nothing } @@ -728,9 +791,9 @@ public Date parseDate(String input){ } /** * Return the next substring to parse. Length is either specified or token.length, but a - * separator or an ISO-8601 delimiter can cut the substring short. (e.g. if the token pattern is - * "YYYY" we expect the next 4 characters to be 4 numbers. However, if it is "976/" then we - * return "976" because a separator cuts it short.) + * separator can cut the substring short. (e.g. if the token pattern is "YYYY" we expect the next + * 4 characters to be 4 numbers. 
However, if it is "976/" then we return "976" because a + * separator cuts it short.) */ private String getNextSubstring(String s, int begin, Token token) { return getNextSubstring(s, begin, begin + token.length, token); @@ -748,16 +811,11 @@ private String getNextSubstring(String s, int begin, int end, Token token) { return s; } } - for (String sep : VALID_SEPARATORS) { - if (s.contains(sep)) { - s = s.substring(0, s.indexOf(sep)); - } - } - // TODO this will cause problems with DAY (for example, Thursday starts with T) - for (String delimiter : VALID_ISO_8601_DELIMITERS) { - if (s.toLowerCase().contains(delimiter)) { - s = s.substring(0, s.toLowerCase().indexOf(delimiter)); - } + // next non-numeric character is a delimiter. Don't worry about AM/PM since we've already + // handled that case. + if ((token.type == TokenType.TEMPORAL || token.type == TokenType.TIMEZONE) + && s.matches(".*\\D.*")) { + s = s.split("\\D", 2)[0]; } return s; @@ -765,8 +823,11 @@ private String getNextSubstring(String s, int begin, int end, Token token) { /** * Get the integer value of a temporal substring. + * @throws IllegalArgumentException */ private int parseTemporal(String substring, Token token){ + checkFormatExact(substring, token); + // exceptions to the rule if (token.temporalField == ChronoField.AMPM_OF_DAY) { return substring.toLowerCase().startsWith("a") ? 
AM : PM; @@ -805,6 +866,23 @@ private int parseTemporal(String substring, Token token){ } } + /** + * @throws IllegalArgumentException + */ + private void checkFormatExact(String substring, Token token) { + // AM/PM defaults to length 4 but make it 2 for FX check if the pattern actually has length 2 + if (formatExact && token.temporalField == ChronoField.AMPM_OF_DAY) { + token.length = token.string.length(); + } + if (formatExact + && !(token.fillMode || token.temporalField == ChronoField.NANO_OF_SECOND) + && token.length != substring.length()) { + throw new IllegalArgumentException( + "FX on and expected token length " + token.length + " for token " + token.toString() + + " does not match substring (" + substring + ") length " + substring.length()); + } + } + /** * Parse the next separator(s). At least one separator character is expected. Separator * characters are interchangeable. @@ -814,38 +892,45 @@ private int parseTemporal(String substring, Token token){ * separator, UNLESS this is the only separator character in the separator substring (in * which case it is not counted as the negative sign). 
* - * @throws IllegalArgumentException if separator is missing + * @throws IllegalArgumentException if separator is missing or if FX is on and separator doesn't + * match the expected separator pattern exactly */ - private int parseSeparator(String fullInput, int index, Token token){ - int separatorsFound = 0; + private int parseSeparator(String fullInput, int index, Token token) { int begin = index; + String s; + StringBuilder separatorsFound = new StringBuilder(); while (index < fullInput.length() && VALID_SEPARATORS.contains(fullInput.substring(index, index + 1))) { + s = fullInput.substring(index, index + 1); if (!isLastCharacterOfSeparator(index, fullInput) - || !("-".equals(fullInput.substring(index, index + 1)) && (nextTokenIs("tzh", token))) - || separatorsFound == 0) { - separatorsFound++; + || !("-".equals(s) && (nextTokenIs("tzh", token))) + || separatorsFound.length() == 0) { + separatorsFound.append(s); } index++; } - if (separatorsFound == 0) { + if (separatorsFound.length() == 0) { throw new IllegalArgumentException("Missing separator at index " + index); } - return begin + separatorsFound; + if (formatExact && !token.string.equals(separatorsFound.toString())) { + throw new IllegalArgumentException("FX on and separator found: " + separatorsFound.toString() + + " doesn't match expected separator: " + token.string); + } + + return begin + separatorsFound.length(); } - private int parseIso8601Delimiter(String fullInput, int index, Token token) { + private int parseText(String fullInput, int index, Token token) { String substring; - substring = fullInput.substring(index, index + 1); - if (token.string.equalsIgnoreCase(substring)) { - index++; - } else { + substring = fullInput.substring(index, index + token.length); + if (!token.string.equalsIgnoreCase(substring)) { throw new IllegalArgumentException( - "Missing ISO 8601 delimiter " + token.string.toUpperCase()); + "Wrong input at index " + index + ": Expected: \"" + token.string + "\" but got: \"" + + 
substring + "\" for token: " + token); } - return index; + return index + token.length; } /** diff --git common/src/test/org/apache/hadoop/hive/common/format/datetime/TestHiveSqlDateTimeFormatter.java common/src/test/org/apache/hadoop/hive/common/format/datetime/TestHiveSqlDateTimeFormatter.java index 4e822d53f9..37966ac07f 100644 --- common/src/test/org/apache/hadoop/hive/common/format/datetime/TestHiveSqlDateTimeFormatter.java +++ common/src/test/org/apache/hadoop/hive/common/format/datetime/TestHiveSqlDateTimeFormatter.java @@ -25,22 +25,10 @@ import java.time.LocalDate; import java.time.LocalDateTime; -import java.time.ZoneOffset; -import java.time.format.DateTimeFormatter; -import java.time.format.DateTimeFormatterBuilder; -import java.time.format.ResolverStyle; -import java.time.format.SignStyle; import java.time.temporal.ChronoField; import java.time.temporal.TemporalField; import java.util.ArrayList; -import static java.time.temporal.ChronoField.DAY_OF_MONTH; -import static java.time.temporal.ChronoField.HOUR_OF_DAY; -import static java.time.temporal.ChronoField.MINUTE_OF_HOUR; -import static java.time.temporal.ChronoField.MONTH_OF_YEAR; -import static java.time.temporal.ChronoField.SECOND_OF_MINUTE; -import static java.time.temporal.ChronoField.YEAR; - /** * Tests HiveSqlDateTimeFormatter. 
*/ @@ -74,6 +62,7 @@ public void testSetPattern() { } public void testSetPatternWithBadPatterns() { + verifyBadPattern("", true); verifyBadPattern("eyyyy-ddd", true); verifyBadPattern("1yyyy-mm-dd", true); @@ -104,9 +93,9 @@ public void testSetPatternWithBadPatterns() { public void testFormatTimestamp() { checkFormatTs("rr rrrr ddd", "2018-01-03 00:00:00", "18 2018 003"); - checkFormatTs("yyyy-mm-ddtsssss.ff4z", "2018-02-03 00:00:10.777777777", "2018-02-03T00010.7777Z"); + checkFormatTs("yyyy-mm-dd sssss.ff4", "2018-02-03 00:00:10.777777777", "2018-02-03 00010.7777"); checkFormatTs("hh24:mi:ss.ff1", "2018-02-03 01:02:03.999999999", "01:02:03.9"); - checkFormatTs("y yyy hh:mi:ss.ffz", "2018-02-03 01:02:03.0070070", "8 018 01:02:03.007007Z"); + checkFormatTs("y yyy hh:mi:ss.ff", "2018-02-03 01:02:03.0070070", "8 018 01:02:03.007007"); checkFormatTs("am a.m. pm p.m. AM A.M. PM P.M.", "2018-02-03 01:02:03.0070070", "am a.m. am a.m. AM A.M. AM A.M."); checkFormatTs("HH12 P.M.", "2019-01-01 00:15:10", "12 A.M."); checkFormatTs("HH12 AM", "2019-01-01 12:15:10", "12 PM"); @@ -115,14 +104,15 @@ public void testFormatTimestamp() { private void checkFormatTs(String pattern, String input, String expectedOutput) { formatter = new HiveSqlDateTimeFormatter(pattern, false); - assertEquals(expectedOutput, formatter.format(toTimestamp(input))); + assertEquals("Format timestamp to string failed with pattern: " + pattern, + expectedOutput, formatter.format(Timestamp.valueOf(input))); } public void testFormatDate() { checkFormatDate("rr rrrr ddd", "2018-01-03", "18 2018 003"); - checkFormatDate("yyyy-mm-ddtsssss.ff4z", "2018-02-03", "2018-02-03T00000.0000Z"); + checkFormatDate("yyyy-mm-dd sssss.ff4 ", "2018-02-03", "2018-02-03 00000.0000 "); checkFormatDate("hh24:mi:ss.ff1", "2018-02-03", "00:00:00.0"); - checkFormatDate("y yyy T hh:mi:ss.ff am z", "2018-02-03", "8 018 T 12:00:00.0 am Z"); + checkFormatDate("y yyy hh:mi:ss.ff am", "2018-02-03", "8 018 12:00:00.0 am"); 
checkFormatDate("am a.m. pm p.m. AM A.M. PM P.M.", "2018-02-03", "am a.m. am a.m. AM A.M. AM A.M."); checkFormatDate("DDD", "2019-12-31", "365"); checkFormatDate("DDD", "2020-12-31", "366"); @@ -130,7 +120,8 @@ public void testFormatDate() { private void checkFormatDate(String pattern, String input, String expectedOutput) { formatter = new HiveSqlDateTimeFormatter(pattern, false); - assertEquals(expectedOutput, formatter.format(toDate(input))); + assertEquals("Format date to string failed with pattern: " + pattern, + expectedOutput, formatter.format(Date.valueOf(input))); } public void testParseTimestamp() { @@ -156,12 +147,12 @@ public void testParseTimestamp() { checkParseTimestamp("rrrr-mm-dd", "99-02-03", firstTwoDigits + "99-02-03 00:00:00"); //everything else - checkParseTimestamp("yyyy-mm-ddThh24:mi:ss.ff8z", "2018-02-03T04:05:06.5665Z", "2018-02-03 04:05:06.5665"); + checkParseTimestamp("yyyy-mm-dd hh24:mi:ss.ff8", "2018-02-03 04:05:06.5665", "2018-02-03 04:05:06.5665"); checkParseTimestamp("yyyy-mm-dd hh24:mi:ss.ff", "2018-02-03 04:05:06.555555555", "2018-02-03 04:05:06.555555555"); checkParseTimestamp("yyyy-mm-dd hh12:mi:ss", "2099-2-03 04:05:06", "2099-02-03 04:05:06"); checkParseTimestamp("yyyyddd", "2018284", "2018-10-11 00:00:00"); checkParseTimestamp("yyyyddd", "20184", "2018-01-04 00:00:00"); - checkParseTimestamp("yyyy-mm-ddThh24:mi:ss.ffz", "2018-02-03t04:05:06.444Z", "2018-02-03 04:05:06.444"); + checkParseTimestamp("yyyy-mm-dd hh24:mi:ss.ff ", "2018-02-03 04:05:06.444 ", "2018-02-03 04:05:06.444"); checkParseTimestamp("yyyy-mm-dd hh:mi:ss A.M.", "2018-02-03 04:05:06 P.M.", "2018-02-03 16:05:06"); checkParseTimestamp("YYYY-MM-DD HH24:MI TZH:TZM", "2019-1-1 14:00--1:-30", "2019-01-01 14:00:00"); checkParseTimestamp("YYYY-MM-DD HH24:MI TZH:TZM", "2019-1-1 14:00-1:30", "2019-01-01 14:00:00"); @@ -202,7 +193,8 @@ private int getFirstTwoDigits() { private void checkParseTimestamp(String pattern, String input, String expectedOutput) { formatter = new 
HiveSqlDateTimeFormatter(pattern, true); - assertEquals(toTimestamp(expectedOutput), formatter.parseTimestamp(input)); + assertEquals("Parse string to timestamp failed. Pattern: " + pattern, + Timestamp.valueOf(expectedOutput), formatter.parseTimestamp(input)); } public void testParseDate() { @@ -232,15 +224,15 @@ public void testParseDate() { private void checkParseDate(String pattern, String input, String expectedOutput) { formatter = new HiveSqlDateTimeFormatter(pattern, true); - assertEquals(toDate(expectedOutput), formatter.parseDate(input)); + assertEquals("Parse string to date failed. Pattern: " + pattern, + Date.valueOf(expectedOutput), formatter.parseDate(input)); } public void testParseTimestampError() { - verifyBadParseString("yyyy", "2019-02-03"); verifyBadParseString("yyyy-mm-dd ", "2019-02-03"); //separator missing verifyBadParseString("yyyy-mm-dd", "2019-02-03..."); //extra separators verifyBadParseString("yyyy-mm-dd hh12:mi:ss", "2019-02-03 14:00:00"); //hh12 out of range - verifyBadParseString("yyyy-dddsssss", "2019-912345"); + verifyBadParseString("yyyy-dddsssss", "2019-912345"); //ddd out of range verifyBadParseString("yyyy-mm-dd", "2019-13-23"); //mm out of range verifyBadParseString("yyyy-mm-dd tzh:tzm", "2019-01-01 +16:00"); //tzh out of range verifyBadParseString("yyyy-mm-dd tzh:tzm", "2019-01-01 +14:60"); //tzm out of range @@ -250,12 +242,99 @@ public void testParseTimestampError() { private void verifyBadPattern(String string, boolean forParsing) { try { formatter = new HiveSqlDateTimeFormatter(string, forParsing); - fail(); + fail("Bad pattern " + string + " should have thrown IllegalArgumentException but didn't"); } catch (Exception e) { - assertEquals(e.getClass().getName(), IllegalArgumentException.class.getName()); + assertEquals("Expected IllegalArgumentException, got another exception.", + e.getClass().getName(), IllegalArgumentException.class.getName()); } } + public void testFmFx() { + //fm + //year (019) becomes 19 even if 
pattern is yyy + checkFormatTs("FMyyy-FMmm-dd FMHH12:MI:FMSS", "2019-01-01 01:01:01", "19-1-01 1:01:1"); + //ff[1-9] shouldn't be affected, because leading zeroes hold information + checkFormatTs("FF5/FMFF5", "2019-01-01 01:01:01.0333", "03330/03330"); + checkFormatTs("FF/FMFF", "2019-01-01 01:01:01.0333", "0333/0333"); + //only affects temporals that immediately follow + verifyBadPattern("yyy-mm-dd FM,HH12", false); + verifyBadPattern("yyy-mm-dd FM,HH12", true); + verifyBadPattern("yyy-mm-dd HH12 tzh:fmtzm", true); + verifyBadPattern("FMFMyyy-mm-dd", true); + verifyBadPattern("FMFXDD-MM-YYYY ff2", true); + + //fx + checkParseDate("FXDD-MM-YYYY", "01-01-1998", "1998-01-01"); + checkParseTimestamp("FXDD-MM-YYYY hh12:mi:ss.ff", "15-01-1998 11:12:13.0", "1998-01-15 11:12:13"); + //ff[1-9] are exempt + checkParseTimestamp("FXDD-MM-YYYY hh12:mi:ss.ff6", "01-01-1998 00:00:00.4440", "1998-01-01 00:00:00.444"); + //fx can be anywhere in the pattern string + checkParseTimestamp("DD-MM-YYYYFX", "01-01-1998", "1998-01-01 00:00:00"); + verifyBadParseString("DD-MM-YYYYFX", "1-01-1998"); + //same separators required + verifyBadParseString("FXDD-MM-YYYY", "15/01/1998"); + //no filling in zeroes or year digits + verifyBadParseString("FXDD-MM-YYYY", "1-01-1998"); + verifyBadParseString("FXDD-MM-YYYY", "01-01-98"); + //no leading or trailing whitespace + verifyBadParseString("FXDD-MM-YYYY", " 01-01-1998 "); + //enforce correct amount of leading zeroes + verifyBadParseString("FXyyyy-mm-dd hh:miss PM", "2018-01-01 17:005 PM"); + verifyBadParseString("FXyyyy-mm-dd sssss", "2019-01-01 003"); + //text case does not matter + checkParseTimestamp("\"the DATE is\" yyyy-mm-dd", "the date is 2018-01-01", "2018-01-01 00:00:00"); + //AM/PM length has to match, but case doesn't + checkParseTimestamp("FXDD-MM-YYYY hh12 am", "01-01-1998 12 PM", "1998-01-01 12:00:00"); + checkParseTimestamp("FXDD-MM-YYYY hh12 A.M.", "01-01-1998 12 p.m.", "1998-01-01 12:00:00"); + verifyBadParseString("FXDD-MM-YYYY 
hh12 am", "01-01-1998 12 p.m."); + verifyBadParseString("FXDD-MM-YYYY hh12 a.m.", "01-01-1998 12 pm"); + + //fm modifies fx + checkParseTimestamp("FXDD-FMMM-YYYY hh12 am", "01-1-1998 12 PM", "1998-01-01 12:00:00"); + checkParseTimestamp("FXFMDD-MM-YYYY hh12 am", "1-01-1998 12 PM", "1998-01-01 12:00:00"); + //ff[1-9] unaffected + checkParseTimestamp("FXFMDD-MM-YYYY FMff2", "1-01-1998 4", "1998-01-01 00:00:00.4"); + checkParseTimestamp("FXFMDD-MM-YYYY ff2", "1-01-1998 4", "1998-01-01 00:00:00.4"); + } + + public void testText() { + // keep exact text upon format + checkFormatTs("hh24:mi \" Is \" hh12 PM\".\"", "2008-01-01 17:00:00", "17:00 Is 05 PM."); + checkFormatDate("\" `the _year_ is` \" yyyy\".\"", "2008-01-01", " `the _year_ is` 2008."); + // empty text strings work + checkParseTimestamp("\"\"yyyy\"\"-mm-dd\"\"", "2019-01-01", "2019-01-01 00:00:00"); + checkParseDate("\"\"yyyy\"\"-mm-dd\"\"", "2019-01-01", "2019-01-01"); + // Case doesn't matter upon parsing + checkParseTimestamp("\"Year \"YYYY \"month\" MM \"day\" DD.\"!\"", + "YEaR 3000 mOnTh 3 DaY 1...!", "3000-03-01 00:00:00"); + checkParseDate("\"Year \"YYYY \"month\" MM \"day\" DD.\"!\"", + "YEaR 3000 mOnTh 3 DaY 1...!", "3000-03-01"); + // Characters matter upon parsing + verifyBadParseString("\"Year! 
\"YYYY \"m\" MM \"d\" DD.\"!\"", "Year 3000 m 3 d 1,!"); + // non-numeric characters in text counts as a delimiter + checkParseDate("yyyy\"m\"mm\"d\"dd", "19m1d1", LocalDate.now().getYear() / 100 + "19-01-01"); + checkParseDate("yyyy\"[\"mm\"]\"dd", "19[1]1", LocalDate.now().getYear() / 100 + "19-01-01"); + + // single quotes are separators and not text delimiters + checkParseTimestamp("\"Y\'ear \"YYYY \' \"month\" MM \"day\" DD.\"!\"", + "Y'EaR 3000 ' mOnTh 3 DaY 1...!", "3000-03-01 00:00:00"); + checkParseDate("\"Y\'ear \"YYYY \' \"month\" MM \"day\" DD.\"!\"", + "Y'EaR 3000 ' mOnTh 3 DaY 1...!", "3000-03-01"); + // literal double quotes are escaped + checkFormatTs("\"the \\\"DATE\\\" is\" yyyy-mm-dd", + "2018-01-01 00:00:00", "the \"DATE\" is 2018-01-01"); + checkFormatTs("\"\\\"\\\"\\\"\"", "2018-01-01 00:00:00", "\"\"\""); + checkParseTimestamp("\"the \\\"DATE\\\" is\" yyyy-mm-dd", + "the \"date\" is 2018-01-01", "2018-01-01 00:00:00"); + // Check variations of apostrophes, literal and non-literal double quotes + checkParseTimestamp("yyyy'\"\"mm-dd", "2019\'01-01", "2019-01-01 00:00:00"); + checkParseTimestamp("yyyy\'\"\"mm-dd", "2019\'01-01", "2019-01-01 00:00:00"); + checkParseTimestamp("yyyy'\"\"mm-dd", "2019'01-01", "2019-01-01 00:00:00"); + checkParseTimestamp("yyyy\'\"\"mm-dd", "2019'01-01", "2019-01-01 00:00:00"); + checkParseTimestamp("yyyy\'\"\\\"\"mm-dd", "2019'\"01-01", "2019-01-01 00:00:00"); + checkParseTimestamp("yyyy\'\"\\\"\"mm-dd", "2019\'\"01-01", "2019-01-01 00:00:00"); + } + /** * Verify pattern is parsed correctly. 
* Check: @@ -268,7 +347,7 @@ private void verifyPatternParsing(String pattern, ArrayList tempo } private void verifyPatternParsing(String pattern, int expectedPatternLength, - String expectedPattern, ArrayList temporalFields) { + String expectedPattern, ArrayList temporalFields) { formatter = new HiveSqlDateTimeFormatter(pattern, false); assertEquals(temporalFields.size(), formatter.getTokens().size()); StringBuilder sb = new StringBuilder(); @@ -285,46 +364,14 @@ private void verifyPatternParsing(String pattern, int expectedPatternLength, } private void verifyBadParseString(String pattern, String string) { + formatter = new HiveSqlDateTimeFormatter(pattern, true); try { - formatter = new HiveSqlDateTimeFormatter(pattern, true); formatter.parseTimestamp(string); - fail(); + fail("Parse string to timestamp should have failed.\nString: " + string + "\nPattern: " + + pattern); } catch (Exception e) { - assertEquals(e.getClass().getName(), IllegalArgumentException.class.getName()); + assertEquals("Expected IllegalArgumentException, got another exception.", + e.getClass().getName(), IllegalArgumentException.class.getName()); } } - - - // Methods that construct datetime objects using java.time.DateTimeFormatter. - - public static Date toDate(String s) { - LocalDate localDate = LocalDate.parse(s, DATE_FORMATTER); - return Date.ofEpochDay((int) localDate.toEpochDay()); - } - - /** - * This is effectively the old Timestamp.valueOf method. 
- */ - public static Timestamp toTimestamp(String s) { - LocalDateTime localDateTime = LocalDateTime.parse(s.trim(), TIMESTAMP_FORMATTER); - return Timestamp.ofEpochSecond( - localDateTime.toEpochSecond(ZoneOffset.UTC), localDateTime.getNano()); - } - - private static final DateTimeFormatter DATE_FORMATTER = - DateTimeFormatter.ofPattern("yyyy-MM-dd"); - private static final DateTimeFormatter TIMESTAMP_FORMATTER; - static { - DateTimeFormatterBuilder builder = new DateTimeFormatterBuilder(); - builder.appendValue(YEAR, 1, 10, SignStyle.NORMAL).appendLiteral('-') - .appendValue(MONTH_OF_YEAR, 1, 2, SignStyle.NORMAL).appendLiteral('-') - .appendValue(DAY_OF_MONTH, 1, 2, SignStyle.NORMAL) - .optionalStart().appendLiteral(" ") - .appendValue(HOUR_OF_DAY, 1, 2, SignStyle.NORMAL).appendLiteral(':') - .appendValue(MINUTE_OF_HOUR, 1, 2, SignStyle.NORMAL).appendLiteral(':') - .appendValue(SECOND_OF_MINUTE, 1, 2, SignStyle.NORMAL) - .optionalStart().appendFraction(ChronoField.NANO_OF_SECOND, 1, 9, true).optionalEnd() - .optionalEnd(); - TIMESTAMP_FORMATTER = builder.toFormatter().withResolverStyle(ResolverStyle.LENIENT); - } } diff --git ql/src/test/queries/clientpositive/cast_datetime_with_sql_2016_format.q ql/src/test/queries/clientpositive/cast_datetime_with_sql_2016_format.q index 269edf6da6..e2e56913e6 100644 --- ql/src/test/queries/clientpositive/cast_datetime_with_sql_2016_format.q +++ ql/src/test/queries/clientpositive/cast_datetime_with_sql_2016_format.q @@ -37,6 +37,11 @@ from varchars select cast (s as date format "yyyy.mm.dd"); from chars select cast (s as timestamp format "yyyy.mm.dd"); from chars select cast (s as date format "yyyy.mm.dd"); +--quotation marks, apostrophes, and literal quotation marks are handled correctly +select +cast ("2019\' \' '' 01-01" as timestamp format "yyyy\'' \'' mm-dd"), +cast ("2019\"01-01" as timestamp format "yyyy\"\\\"\"mm-dd"), +cast ('2019\' " \' 01-01' as timestamp format 'yyyy\' "\\" \'" mm-dd'); --correct descriptions 
explain from strings select cast (s as timestamp format "yyy.mm.dd"); diff --git ql/src/test/results/clientpositive/cast_datetime_with_sql_2016_format.q.out ql/src/test/results/clientpositive/cast_datetime_with_sql_2016_format.q.out index 4a502b9700..b9526f3b30 100644 --- ql/src/test/results/clientpositive/cast_datetime_with_sql_2016_format.q.out +++ ql/src/test/results/clientpositive/cast_datetime_with_sql_2016_format.q.out @@ -227,6 +227,21 @@ POSTHOOK: Input: default@chars #### A masked pattern was here #### 2020-02-03 1969-12-31 +PREHOOK: query: select +cast ("2019\' \' '' 01-01" as timestamp format "yyyy\'' \'' mm-dd"), +cast ("2019\"01-01" as timestamp format "yyyy\"\\\"\"mm-dd"), +cast ('2019\' " \' 01-01' as timestamp format 'yyyy\' "\\" \'" mm-dd') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +#### A masked pattern was here #### +POSTHOOK: query: select +cast ("2019\' \' '' 01-01" as timestamp format "yyyy\'' \'' mm-dd"), +cast ("2019\"01-01" as timestamp format "yyyy\"\\\"\"mm-dd"), +cast ('2019\' " \' 01-01' as timestamp format 'yyyy\' "\\" \'" mm-dd') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +#### A masked pattern was here #### +2019-01-01 00:00:00 2019-01-01 00:00:00 2019-01-01 00:00:00 PREHOOK: query: explain from strings select cast (s as timestamp format "yyy.mm.dd") PREHOOK: type: QUERY PREHOOK: Input: default@strings