diff --git common/src/java/org/apache/hadoop/hive/common/format/datetime/HiveSqlDateTimeFormatter.java common/src/java/org/apache/hadoop/hive/common/format/datetime/HiveSqlDateTimeFormatter.java index 1a0d7e6a27..4ae9674451 100644 --- common/src/java/org/apache/hadoop/hive/common/format/datetime/HiveSqlDateTimeFormatter.java +++ common/src/java/org/apache/hadoop/hive/common/format/datetime/HiveSqlDateTimeFormatter.java @@ -24,6 +24,8 @@ import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import org.apache.commons.lang3.StringUtils; +import org.apache.commons.lang3.tuple.ImmutablePair; +import org.apache.commons.lang3.tuple.Pair; import org.apache.hadoop.hive.common.type.Date; import org.apache.hadoop.hive.common.type.Timestamp; @@ -412,6 +414,7 @@ private final String pattern; private final List tokens; + private final List temporalTokens; // subset of tokens; only contains temporal ones private final Optional now; private boolean formatExact = false; @@ -477,6 +480,23 @@ IsoFields.WEEK_OF_WEEK_BASED_YEAR, IsoFields.WEEK_BASED_YEAR); + // give an order of precedence to date-based temporal units. When parsing, as we assign the + // parsed values to the output timestamp, we want to assign the year first, then the month, etc. + private static final Map PRECEDENCE = + ImmutableMap.builder() + // YEAR: 1 + .put(ChronoField.YEAR, 1) + .put(IsoFields.WEEK_BASED_YEAR, 1) + // MONTH: 2 + .put(ChronoField.MONTH_OF_YEAR, 2) + // WEEK: 3 (NOTE(review): the value below is 4, same as DAY; confirm which is intended) + .put(IsoFields.WEEK_OF_WEEK_BASED_YEAR, 4) + // DAY: 4 + .put(ChronoField.DAY_OF_MONTH, 4) + .put(ChronoField.DAY_OF_YEAR, 4) + .put(WeekFields.SUNDAY_START.dayOfWeek(), 4) + .build(); + /** * Represents broad categories of tokens. */ @@ -492,7 +512,7 @@ /** * Token representation. 
*/ - public static class Token implements Serializable { + public static class Token implements Serializable, Comparable { private static final long serialVersionUID = 1L; @@ -502,6 +522,7 @@ String string; // pattern string, e.g. "yyy" int length; // length (e.g. YYY: 3, FF8: 8) boolean fillMode; //FM, applies to type TEMPORAL only (later should apply to TIMEZONE as well) + int precedence; // precedence of temporalField. See PRECEDENCE Map. public Token(TokenType tokenType, TemporalField temporalField, String string, int length, boolean fillMode) { @@ -524,6 +545,7 @@ public Token(TokenType tokenType, TemporalField temporalField, TemporalUnit temp this.string = string; this.length = length; this.fillMode = fillMode; + precedence = PRECEDENCE.getOrDefault(temporalField, Integer.MAX_VALUE); } @Override public String toString() { @@ -545,6 +567,16 @@ public void removeBackslashes() { string = string.replaceAll("\\\\", ""); length = string.length(); } + + @Override public int compareTo(Object o) { + if (o == null) { + throw new NullPointerException(); + } + if (!(o instanceof Token)) { + throw new IllegalArgumentException("Object " + o + " not an instance of Token."); + } + return Integer.compare(this.precedence, ((Token) o).precedence); + } } /** @@ -575,6 +607,7 @@ public HiveSqlDateTimeFormatter(final String pattern, final boolean forParsing) this.now = Objects.requireNonNull(now); this.tokens = new ArrayList<>(); + this.temporalTokens = new ArrayList<>(); Preconditions.checkArgument(pattern.length() < LONGEST_ACCEPTED_PATTERN, "The input format is too long"); @@ -728,6 +761,7 @@ private Token parseTemporalToken(String originalPattern, String candidate, boole NUMERIC_TEMPORAL_TOKENS.get(candidate.toLowerCase()), candidate, getTokenStringLength(candidate), fillMode); tokens.add(lastAddedToken); + temporalTokens.add(lastAddedToken); return lastAddedToken; } @@ -740,6 +774,7 @@ private Token parseCharacterTemporalToken(String originalPattern, String candida 
CHARACTER_TEMPORAL_TOKENS.get(candidate.toLowerCase()), candidate, getTokenStringLength(candidate), fillMode); tokens.add(lastAddedToken); + temporalTokens.add(lastAddedToken); return lastAddedToken; } @@ -1040,12 +1075,12 @@ private String padOrTruncateNumericTemporal(Token token, String output) { } public Timestamp parseTimestamp(final String fullInput) { - LocalDateTime ldt = LocalDateTime.ofInstant(Instant.EPOCH, ZoneOffset.UTC); String substring; int index = 0; int value; int timeZoneHours = 0, timeZoneMinutes = 0; int iyyy = 0, iw = 0; + List temporalValues = new ArrayList<>(); for (Token token : tokens) { switch (token.type) { @@ -1058,12 +1093,7 @@ public Timestamp parseTimestamp(final String fullInput) { substring = getNextCharacterSubstring(fullInput, index, token); //e.g. Marcharch -> March value = parseCharacterTemporal(substring, token); // e.g. July->07 } - try { - ldt = ldt.with(token.temporalField, value); - } catch (DateTimeException e){ - throw new IllegalArgumentException( - "Value " + value + " not valid for token " + token); - } + temporalValues.add(value); //update IYYY and IW if necessary if (token.temporalField == IsoFields.WEEK_BASED_YEAR) { @@ -1119,14 +1149,23 @@ public Timestamp parseTimestamp(final String fullInput) { } } - // anything left unparsed at end of string? throw error + checkForLeftoverInput(fullInput, index); + + checkForInvalidIsoWeek(iyyy, iw); + + return getTimestampFromValues(temporalValues); + } + + /** + * Anything left unparsed at end of input string? Throw error. 
+ * @param fullInput full input String + * @param index where we left off parsing + */ + private void checkForLeftoverInput(String fullInput, int index) { if (!fullInput.substring(index).isEmpty()) { throw new IllegalArgumentException("Leftover input after parsing: " + fullInput.substring(index) + " in string " + fullInput); } - checkForInvalidIsoWeek(iyyy, iw); - - return Timestamp.ofEpochSecond(ldt.toEpochSecond(ZoneOffset.UTC), ldt.getNano()); } /** @@ -1139,16 +1178,66 @@ private void checkForInvalidIsoWeek(int iyyy, int iw) { } LocalDateTime ldt = LocalDateTime.ofInstant(Instant.EPOCH, ZoneOffset.UTC); - ldt = ldt.with(IsoFields.WEEK_BASED_YEAR, iyyy); - ldt = ldt.with(IsoFields.WEEK_OF_WEEK_BASED_YEAR, iw); + try { + ldt = ldt.with(IsoFields.WEEK_BASED_YEAR, iyyy); + ldt = ldt.with(IsoFields.WEEK_OF_WEEK_BASED_YEAR, iw); + } catch (DateTimeException e) { + throw new IllegalArgumentException(e); + } + if (ldt.getYear() != iyyy) { throw new IllegalArgumentException("ISO year " + iyyy + " does not have " + iw + " weeks."); } } + /** + * Make a list of pairs: + * Left value: the tokens that represent a temporal value + * Right value: their corresponding values parsed from the input + * + * Sort this list by Token.precedence (see Map PRECEDENCE, Token#compareTo). + * Then create the parsed output Timestamp object. + * + * The point of this is to parse input like: "29.02.2000" (pattern "dd.mm.yyyy") + * correctly – if we assigned the day value to the timestamp before the year value, then + * output would be 2000-02-28. 
+ * + * @param temporalValues list of integer values parsed from the input, in order of input + * @return the parsed Timestamp + */ + private Timestamp getTimestampFromValues(List temporalValues) { + // Make list of Token/int Pairs + if (temporalTokens.size() != temporalValues.size()) { + throw new IllegalStateException("temporalTokens list length (" + temporalTokens.size() + + ") differs from that of temporalValues (length: " + temporalValues.size() + ")"); + } + List tokensList = new ArrayList<>(); + for (int i = 0; i < temporalTokens.size(); i++) { + ImmutablePair pair = new ImmutablePair<>(temporalTokens.get(i), temporalValues.get(i)); + tokensList.add(pair); + } + + Collections.sort(tokensList); + + // Create Timestamp + LocalDateTime ldt = LocalDateTime.ofInstant(Instant.EPOCH, ZoneOffset.UTC); + for (Pair pair : tokensList) { + TemporalField tf = ((Token) pair.getLeft()).temporalField; + int val = (int) pair.getRight(); + try { + ldt = ldt.with(tf, val); + } catch (DateTimeException e){ + throw new IllegalArgumentException( + "Value " + val + " not valid for token " + tf); + } + } + return Timestamp.ofEpochSecond(ldt.toEpochSecond(ZoneOffset.UTC), ldt.getNano()); + } + public Date parseDate(String input){ return Date.ofEpochMilli(parseTimestamp(input).toEpochMilli()); } + /** * Return the next substring to parse. Length is either specified or token.length, but a * separator or an ISO-8601 delimiter can cut the substring short. (e.g. 
if the token pattern is diff --git common/src/test/org/apache/hadoop/hive/common/format/datetime/TestHiveSqlDateTimeFormatter.java common/src/test/org/apache/hadoop/hive/common/format/datetime/TestHiveSqlDateTimeFormatter.java index 9c9b0bedcf..a07cb1f1b3 100644 --- common/src/test/org/apache/hadoop/hive/common/format/datetime/TestHiveSqlDateTimeFormatter.java +++ common/src/test/org/apache/hadoop/hive/common/format/datetime/TestHiveSqlDateTimeFormatter.java @@ -251,6 +251,8 @@ public void testParseTimestamp() { checkParseTimestamp("YYYY DDD", "2000 60", "2000-02-29 00:00:00"); checkParseTimestamp("YYYY DDD", "2000 61", "2000-03-01 00:00:00"); checkParseTimestamp("YYYY DDD", "2000 366", "2000-12-31 00:00:00"); + //Leap day, parse day first + checkParseTimestamp("dd mm yyyy", "29 02 2000", "2000-02-29 00:00:00"); //Test timezone offset parsing without separators checkParseTimestamp("YYYYMMDDHH12MIA.M.TZHTZM", "201812310800AM+0515", "2018-12-31 08:00:00"); checkParseTimestamp("YYYYMMDDHH12MIA.M.TZHTZM", "201812310800AM0515", "2018-12-31 08:00:00"); @@ -533,8 +535,8 @@ private void verifyBadParseString(String pattern, String string) { fail("Parse string to timestamp should have failed.\nString: " + string + "\nPattern: " + pattern + ", output = " + output); } catch (Exception e) { - assertEquals("Expected IllegalArgumentException, got another exception.", - e.getClass().getName(), IllegalArgumentException.class.getName()); + assertEquals("Expected IllegalArgumentException, got another exception:" + e, + IllegalArgumentException.class.getName(), e.getClass().getName()); } } }