Description
When the writer time zone is not UTC, the statistics for timestamp columns are incorrect.
Minimal example to reproduce:
#include "orc/OrcFile.hh"

// Minimal reproduction: write one row containing a timestamp with a non-UTC
// writer time zone ("Asia/Shanghai"), producing ./test.orc. The file can then
// be inspected with the orc-statistics and orc-contents tools.
int main() {
  std::unique_ptr<orc::Type> type(
      orc::Type::buildTypeFromString("struct<x:int,y:timestamp>"));
  std::unique_ptr<orc::OutputStream> outStream =
      orc::writeLocalFile("./test.orc");

  // The non-UTC writer time zone is what triggers the reported problem.
  orc::WriterOptions options;
  options.setTimezoneName("Asia/Shanghai");

  std::unique_ptr<orc::Writer> writer =
      createWriter(*type, outStream.get(), options);
  std::unique_ptr<orc::ColumnVectorBatch> batch = writer->createRowBatch(1);

  // The root batch is a struct whose fields follow the schema order: x, y.
  orc::StructVectorBatch *root =
      dynamic_cast<orc::StructVectorBatch *>(batch.get());
  orc::LongVectorBatch *x =
      dynamic_cast<orc::LongVectorBatch *>(root->fields[0]);
  orc::TimestampVectorBatch *y =
      dynamic_cast<orc::TimestampVectorBatch *>(root->fields[1]);

  x->data[0] = 1;

  // 2022-04-16T18:32:43.3210+00:00
  y->data[0] = 1650133963;
  y->nanoseconds[0] = 321000000;

  // Exactly one row is populated in every batch.
  x->numElements = 1;
  y->numElements = 1;
  root->numElements = 1;

  writer->add(*batch);
  writer->close();
  return 0;
}
Statistics:
# bin/orc-statistics test.orc
File test.orc has 3 columns
*** Column 0 ***
Column has 1 values and has null value: no
*** Column 1 ***
Data type: Integer
Values: 1
Has null: no
Minimum: 1
Maximum: 1
Sum: 1
*** Column 2 ***
Data type: Timestamp
Values: 1
Has null: no
Minimum: 2022-04-16 18:33:12.121
LowerBound: 2022-04-16 18:33:12.121
Maximum: 2022-04-16 18:33:12.121
UpperBound: 2022-04-16 18:33:12.122
File test.orc has 1 stripes
*** Stripe 0 ***
--- Column 0 ---
Column has 1 values and has null value: no
--- Column 1 ---
Data type: Integer
Values: 1
Has null: no
Minimum: 1
Maximum: 1
Sum: 1
--- Column 2 ---
Data type: Timestamp
Values: 1
Has null: no
Minimum: 2022-04-16 18:33:12.121
LowerBound: 2022-04-16 18:33:12.121
Maximum: 2022-04-16 18:33:12.121
UpperBound: 2022-04-16 18:33:12.122
Content:
# bin/orc-contents test.orc
{"x": 1, "y": "2022-04-17 02:32:43.321"}
Attachments
Issue Links
- links to