Description
The c++ TimestampColumnWriter will adjust timestamp with gmtOffset:
void TimestampColumnWriter::add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues) { ...... int64_t *secs = tsBatch->data.data() + offset; int64_t *nanos = tsBatch->nanoseconds.data() + offset; ...... bool hasNull = false; for (uint64_t i = 0; i < numValues; ++i) { if (notNull == nullptr || notNull[i]) { // TimestampVectorBatch already stores data in UTC int64_t millsUTC = secs[i] * 1000 + nanos[i] / 1000000; tsStats->increase(1); tsStats->update(millsUTC); secs[i] -= timezone.getVariant(secs[i]).gmtOffset; <-- should not adjust with gmtOffset here secs[i] -= timezone.getEpoch(); nanos[i] = formatNano(nanos[i]); } else if (!hasNull) { hasNull = true; } } tsStats->setHasNull(hasNull); secRleEncoder->add(secs, numValues, notNull); nanoRleEncoder->add(nanos, numValues, notNull); }
The java reader doesn't adjust this:
public void writeBatch(ColumnVector vector, int offset, int length) throws IOException { super.writeBatch(vector, offset, length); TimestampColumnVector vec = (TimestampColumnVector) vector; if (vector.isRepeating) { ...... } else { for (int i = 0; i < length; ++i) { if (vec.noNulls || !vec.isNull[i + offset]) { // ignore the bottom three digits from the vec.time field final long secs = vec.time[i + offset] / MILLIS_PER_SECOND; final int newNanos = vec.nanos[i + offset]; // set the millis based on the top three digits of the nanos long millis = secs * MILLIS_PER_SECOND + newNanos / 1_000_000; if (millis < 0 && newNanos > 999_999) { millis -= MILLIS_PER_SECOND; } long utc = SerializationUtils.convertToUtc(localTimezone, millis); seconds.write(secs - baseEpochSecsLocalTz); <-- only adjust with ORC epoch nanos.write(formatNanos(newNanos)); indexStatistics.updateTimestamp(utc); if (createBloomFilter) { if (bloomFilter != null) { bloomFilter.addLong(millis); } bloomFilterUtf8.addLong(utc); } } } } }
This is a follow-up of ORC-320. I think there's a wrong assumption in c++ codes that timestamps given to the writer's TimestampVectorBatch are equal to timestamps got from the reader's TimestampVectorBatch (see codes in TestWriter::writeTimestamp). Noticed that there're gmtOffset processing logics in the c++ TimestampColumnPrinter.
To reveal the bug:
$ cat /etc/timezone America/Los_Angeles $ cat ts.csv # prepare a csv file with two timestamps 1520762399 1520762400 $ date -d @1520762399 Sun Mar 11 01:59:59 PST 2018 $ date -d @1520762400 Sun Mar 11 03:00:00 PDT 2018 $ build/tools/src/csv-import "struct<col0:timestamp>" ts.csv ts.orc # generate orc file by c++ writer [2018-03-14 21:41:58] Start importing Orc file... [2018-03-14 21:41:58] Finish importing Orc file. [2018-03-14 21:41:58] Total writer elasped time: 0.000475s. [2018-03-14 21:41:58] Total writer CPU time: 0.000474s. $ build/tools/src/orc-contents ts.orc # read by c++ reader, results are not we written {"col0": "2018-03-11 10:59:59.0"} {"col0": "2018-03-11 10:00:00.0"} $ java -jar java/tools/target/orc-tools-1.5.0-SNAPSHOT-uber.jar data ts.orc # read by java reader, results are the same as c++ reader Processing data file ts.orc [length: 257] {"col0":"2018-03-11 10:59:59.0"} {"col0":"2018-03-11 10:00:00.0"}
Since there're no tools importing timestamps as long, we can use java codes to generate a correct orc file:
public class TimestampWriter { public static void main(String[] args) throws IOException { TypeDescription schema = TypeDescription.fromString("struct<col0:timestamp>"); OrcFile.WriterOptions options = OrcFile.writerOptions(new Configuration()).setSchema(schema); Writer writer = OrcFile.createWriter(new Path("./ts_java.orc"), options); VectorizedRowBatch batch = schema.createRowBatch(); TimestampColumnVector tsc = (TimestampColumnVector) batch.cols[0]; tsc.time[0] = 1520762399000L; tsc.nanos[0] = 0; // time array is milliseconds tsc.time[1] = 1520762400000L; tsc.nanos[1] = 0; batch.size = 2; writer.addRowBatch(batch); writer.close(); } }
Run the codes to generate `ts_java.orc` and verify this orc file:
$ build/tools/src/orc-contents ts_java.orc {"col0": "2018-03-11 01:59:59.0"} # The correct results {"col0": "2018-03-11 03:00:00.0"} $ java -jar java/tools/target/orc-tools-1.5.0-SNAPSHOT-uber.jar data ts_java.orc Processing data file ts_java.orc [length: 251] {"col0":"2018-03-11 01:59:59.0"} # The correct results {"col0":"2018-03-11 03:00:00.0"}
Attachments
Issue Links
- incorporates
-
ORC-205 Include writer timezone in stripe footer
- Closed
- links to