commit 52cc2ab404220ce624c3b49ef1357d8ddb53715b
Author: Vihang Karajgaonkar
Date:   Fri Jan 12 17:54:48 2018 -0800

    HIVE-18323 : Vectorization: add support for timestamp in VectorizedPrimitiveColumnReader for Parquet

diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedPrimitiveColumnReader.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedPrimitiveColumnReader.java
index 5e577d2bf87bd1929900636a7f6baa0080c0735b..fbb51d6635b797f9d8eca780dbdf546e41e72741 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedPrimitiveColumnReader.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedPrimitiveColumnReader.java
@@ -30,6 +30,7 @@
 import java.nio.ByteBuffer;
 import java.nio.ByteOrder;
 import java.sql.Timestamp;
+import java.time.Instant;
 
 /**
  * It's column level Parquet reader which is used to read a batch of records for a column,
@@ -37,6 +38,9 @@
  */
 public class VectorizedPrimitiveColumnReader extends BaseVectorizedColumnReader {
 
+  private static final long TEN_TO_POW_3 = 1000L;
+  private static final long TEN_TO_POW_6 = 1000000L;
+
   public VectorizedPrimitiveColumnReader(
       ColumnDescriptor descriptor,
       PageReader pageReader,
@@ -110,8 +114,10 @@ private void readBatchHelper(
       case DECIMAL:
         readDecimal(num, (DecimalColumnVector) column, rowId);
         break;
-      case INTERVAL_DAY_TIME:
       case TIMESTAMP:
+        readTimestamp(num, (TimestampColumnVector) column, rowId);
+        break;
+      case INTERVAL_DAY_TIME:
       default:
         throw new IOException("Unsupported type: " + type);
     }
@@ -288,6 +294,54 @@ private void readBinaries(
     }
   }
 
+  private void readTimestamp(int total, TimestampColumnVector c, int rowId) throws IOException {
+    int left = total;
+    while (left > 0) {
+      readRepetitionAndDefinitionLevels();
+      if (definitionLevel >= maxDefLevel) {
+        switch (descriptor.getType()) {
+          case INT64:
+            long seconds = 0;
+            long nanoSeconds = 0;
+            switch (type.getOriginalType()) {
+              case TIMESTAMP_MILLIS:
+                long milliSeconds = dataColumn.readLong();
+                seconds = milliSeconds / TEN_TO_POW_3;
+                nanoSeconds = (milliSeconds - seconds * TEN_TO_POW_3) * TEN_TO_POW_6;
+                break;
+              case TIMESTAMP_MICROS:
+                long microSeconds = dataColumn.readLong();
+                seconds = microSeconds / TEN_TO_POW_6;
+                nanoSeconds = (microSeconds - seconds * TEN_TO_POW_6) * TEN_TO_POW_3;
+                break;
+              default:
+                throw new IOException(
+                    "Unsupported parquet logical type: " + type.getOriginalType() + " for timestamp");
+            }
+            c.set(rowId, Timestamp.from(Instant.ofEpochSecond(seconds, nanoSeconds)));
+            break;
+          case INT96:
+            NanoTime nt = NanoTime.fromBinary(dataColumn.readBytes());
+            Timestamp ts = NanoTimeUtils.getTimestamp(nt, skipTimestampConversion);
+            c.set(rowId, ts);
+            break;
+          default:
+            throw new IOException(
+                "Unsupported parquet physical type: " + descriptor.getType() + " for timestamp");
+        }
+        c.isNull[rowId] = false;
+        c.isRepeating =
+            c.isRepeating && ((c.time[0] == c.time[rowId]) && (c.nanos[0] == c.nanos[rowId]));
+      } else {
+        c.isNull[rowId] = true;
+        c.isRepeating = false;
+        c.noNulls = false;
+      }
+      rowId++;
+      left--;
+    }
+  }
+
   /**
    * Reads `num` values into column, decoding the values from `dictionaryIds` and `dictionary`.
    */
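Reviewer note: the two INT64 branches above split a single epoch-relative long into the whole-second and nanosecond components that TimestampColumnVector stores. (The original hunk was missing a break; after the INT64 c.set(...), so INT64 values fell through into the INT96 branch; the hunk above adds it and updates the line count accordingly.) Below is a minimal standalone sketch of the split arithmetic; EpochSplitDemo and its main() are illustrative only, not part of the patch, and assume nothing beyond the JDK's java.sql.Timestamp and java.time.Instant.

    import java.sql.Timestamp;
    import java.time.Instant;

    // Mirrors the seconds/nanos split performed in readTimestamp() for
    // INT64 columns annotated TIMESTAMP_MILLIS or TIMESTAMP_MICROS.
    public class EpochSplitDemo {
      private static final long TEN_TO_POW_3 = 1000L;
      private static final long TEN_TO_POW_6 = 1000000L;

      // Millis since epoch -> whole seconds plus the remainder scaled to nanos.
      static Timestamp fromMillis(long milliSeconds) {
        long seconds = milliSeconds / TEN_TO_POW_3;
        long nanoSeconds = (milliSeconds - seconds * TEN_TO_POW_3) * TEN_TO_POW_6;
        return Timestamp.from(Instant.ofEpochSecond(seconds, nanoSeconds));
      }

      // Micros since epoch -> whole seconds plus the remainder scaled to nanos.
      static Timestamp fromMicros(long microSeconds) {
        long seconds = microSeconds / TEN_TO_POW_6;
        long nanoSeconds = (microSeconds - seconds * TEN_TO_POW_6) * TEN_TO_POW_3;
        return Timestamp.from(Instant.ofEpochSecond(seconds, nanoSeconds));
      }

      public static void main(String[] args) {
        // The instants 2019-01-01 23:12:45.123 UTC and ...45.123456 UTC.
        System.out.println(fromMillis(1546384365123L));
        System.out.println(fromMicros(1546384365123456L));
      }
    }

Instant.ofEpochSecond(seconds, nanoAdjustment) normalizes a negative adjustment, so pre-epoch values, where the truncating division leaves a negative remainder, still resolve to the correct instant.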
diff --git a/ql/src/test/queries/clientpositive/vectorized_parquet_types.q b/ql/src/test/queries/clientpositive/vectorized_parquet_types.q
index 63f811b802b3d4af4c9c145298eca2726573cfb2..b122103aa99e7c2761264cdddae221cd6e2a66ff 100644
--- a/ql/src/test/queries/clientpositive/vectorized_parquet_types.q
+++ b/ql/src/test/queries/clientpositive/vectorized_parquet_types.q
@@ -4,6 +4,7 @@ set hive.llap.cache.allow.synthetic.fileid=true;
 
 DROP TABLE parquet_types_staging;
 DROP TABLE parquet_types;
+DROP TABLE IF EXISTS parquet_type_nodict;
 
 -- init
 CREATE TABLE parquet_types_staging (
@@ -84,3 +85,42 @@ SELECT ctinyint,
   FROM parquet_types
 GROUP BY ctinyint
 ORDER BY ctinyint;
+
+-- test with dictionary encoding disabled
+create table parquet_type_nodict like parquet_types
+stored as parquet tblproperties ("parquet.enable.dictionary"="false");
+
+insert into parquet_type_nodict
+select * from parquet_types;
+
+
+explain vectorization expression
+SELECT cint, ctinyint, csmallint, cfloat, cdouble, cstring1, t, cchar, cvarchar,
+hex(cbinary), cdecimal FROM parquet_type_nodict;
+
+SELECT cint, ctinyint, csmallint, cfloat, cdouble, cstring1, t, cchar, cvarchar,
+hex(cbinary), cdecimal FROM parquet_type_nodict;
+
+explain vectorization expression
+SELECT cchar, LENGTH(cchar), cvarchar, LENGTH(cvarchar), cdecimal, SIGN(cdecimal) FROM parquet_type_nodict;
+
+SELECT cchar, LENGTH(cchar), cvarchar, LENGTH(cvarchar), cdecimal, SIGN(cdecimal) FROM parquet_type_nodict;
+
+-- test timestamp vectorization
+explain vectorization select max(t), min(t) from parquet_type_nodict;
+select max(t), min(t) from parquet_type_nodict;
+
+-- test timestamp columnVector isRepeating
+create table test (id int, ts timestamp) stored as parquet tblproperties ("parquet.enable.dictionary"="false");
+
+insert into test values (1, '2019-01-01 23:12:45.123456'), (2, '2019-01-01 23:12:45.123456'), (3, '2019-01-01 23:12:45.123456');
+
+set hive.fetch.task.conversion=none;
+select ts from test where id > 1;
+
+-- test null values in timestamp
+insert into test values (3, NULL);
+select ts from test where id > 1;
+
+DROP TABLE parquet_type_nodict;
+DROP TABLE test;
diff --git a/ql/src/test/results/clientpositive/llap/vectorized_parquet_types.q.out b/ql/src/test/results/clientpositive/llap/vectorized_parquet_types.q.out
index 489ae42013b701089d7da966f234ba87f77c7048..05e34d6fa483e5080c97cedfa0bb55c1ebe43735 100644
--- a/ql/src/test/results/clientpositive/llap/vectorized_parquet_types.q.out
+++ b/ql/src/test/results/clientpositive/llap/vectorized_parquet_types.q.out
@@ -6,6 +6,10 @@ PREHOOK: query: DROP TABLE parquet_types
 PREHOOK: type: DROPTABLE
 POSTHOOK: query: DROP TABLE parquet_types
 POSTHOOK: type: DROPTABLE
+PREHOOK: query: DROP TABLE IF EXISTS parquet_type_nodict
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE IF EXISTS parquet_type_nodict
+POSTHOOK: type: DROPTABLE
 PREHOOK: query: CREATE TABLE parquet_types_staging (
 cint int,
 ctinyint tinyint,
@@ -415,3 +419,299 @@ POSTHOOK: Input: default@parquet_types
 1 121 1 8 1.1749999970197678 2.0621590627301285 90.33
 2 119 1 7 1.2142857142857142 1.8 60.12
 3 120 1 7 1.171428578240531 1.7999999999999996 90.21
+PREHOOK: query: create table parquet_type_nodict like parquet_types
+stored as parquet tblproperties ("parquet.enable.dictionary"="false")
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@parquet_type_nodict
+POSTHOOK: query: create table parquet_type_nodict like parquet_types
+stored as parquet tblproperties
("parquet.enable.dictionary"="false") +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@parquet_type_nodict +PREHOOK: query: insert into parquet_type_nodict +select * from parquet_types +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_types +PREHOOK: Output: default@parquet_type_nodict +POSTHOOK: query: insert into parquet_type_nodict +select * from parquet_types +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_types +POSTHOOK: Output: default@parquet_type_nodict +POSTHOOK: Lineage: parquet_type_nodict.cbinary SIMPLE [(parquet_types)parquet_types.FieldSchema(name:cbinary, type:binary, comment:null), ] +POSTHOOK: Lineage: parquet_type_nodict.cchar SIMPLE [(parquet_types)parquet_types.FieldSchema(name:cchar, type:char(5), comment:null), ] +POSTHOOK: Lineage: parquet_type_nodict.cdecimal SIMPLE [(parquet_types)parquet_types.FieldSchema(name:cdecimal, type:decimal(4,2), comment:null), ] +POSTHOOK: Lineage: parquet_type_nodict.cdouble SIMPLE [(parquet_types)parquet_types.FieldSchema(name:cdouble, type:double, comment:null), ] +POSTHOOK: Lineage: parquet_type_nodict.cfloat SIMPLE [(parquet_types)parquet_types.FieldSchema(name:cfloat, type:float, comment:null), ] +POSTHOOK: Lineage: parquet_type_nodict.cint SIMPLE [(parquet_types)parquet_types.FieldSchema(name:cint, type:int, comment:null), ] +POSTHOOK: Lineage: parquet_type_nodict.csmallint SIMPLE [(parquet_types)parquet_types.FieldSchema(name:csmallint, type:smallint, comment:null), ] +POSTHOOK: Lineage: parquet_type_nodict.cstring1 SIMPLE [(parquet_types)parquet_types.FieldSchema(name:cstring1, type:string, comment:null), ] +POSTHOOK: Lineage: parquet_type_nodict.ctinyint SIMPLE [(parquet_types)parquet_types.FieldSchema(name:ctinyint, type:tinyint, comment:null), ] +POSTHOOK: Lineage: parquet_type_nodict.cvarchar SIMPLE [(parquet_types)parquet_types.FieldSchema(name:cvarchar, type:varchar(10), comment:null), ] +POSTHOOK: Lineage: parquet_type_nodict.t SIMPLE [(parquet_types)parquet_types.FieldSchema(name:t, type:timestamp, comment:null), ] +PREHOOK: query: explain vectorization expression +SELECT cint, ctinyint, csmallint, cfloat, cdouble, cstring1, t, cchar, cvarchar, +hex(cbinary), cdecimal FROM parquet_type_nodict +PREHOOK: type: QUERY +POSTHOOK: query: explain vectorization expression +SELECT cint, ctinyint, csmallint, cfloat, cdouble, cstring1, t, cchar, cvarchar, +hex(cbinary), cdecimal FROM parquet_type_nodict +POSTHOOK: type: QUERY +PLAN VECTORIZATION: + enabled: true + enabledConditionsMet: [hive.vectorized.execution.enabled IS true] + +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + TableScan + alias: parquet_type_nodict + Select Operator + expressions: cint (type: int), ctinyint (type: tinyint), csmallint (type: smallint), cfloat (type: float), cdouble (type: double), cstring1 (type: string), t (type: timestamp), cchar (type: char(5)), cvarchar (type: varchar(10)), hex(cbinary) (type: string), cdecimal (type: decimal(4,2)) + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10 + ListSink + +PREHOOK: query: SELECT cint, ctinyint, csmallint, cfloat, cdouble, cstring1, t, cchar, cvarchar, +hex(cbinary), cdecimal FROM parquet_type_nodict +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_type_nodict +#### A masked pattern was here #### +POSTHOOK: query: SELECT cint, ctinyint, csmallint, cfloat, cdouble, cstring1, t, cchar, cvarchar, +hex(cbinary), cdecimal 
FROM parquet_type_nodict +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_type_nodict +#### A masked pattern was here #### +100 1 1 1.0 0.0 abc 2011-01-01 01:01:01.111111111 a a B4F3CAFDBEDD 48.88 +101 2 2 1.1 0.3 def 2012-02-02 02:02:02.222222222 ab ab 68692CCAC0BDE7 8.72 +102 3 3 1.2 0.6 ghi 2013-03-03 03:03:03.333333333 abc abc B4F3CAFDBEDD 90.21 +103 1 4 1.3 0.9 jkl 2014-04-04 04:04:04.444444444 abcd abcd 68692CCAC0BDE7 3.89 +104 2 5 1.4 1.2 mno 2015-05-05 05:05:05.555555555 abcde abcde B4F3CAFDBEDD 56.23 +105 3 1 1.0 1.5 pqr 2016-06-06 06:06:06.666666666 abcde abcdef 68692CCAC0BDE7 90.21 +106 1 2 1.1 1.8 stu 2017-07-07 07:07:07.777777777 abcde abcdefg B4F3CAFDBEDD 6.09 +107 2 3 1.2 2.1 vwx 2018-08-08 08:08:08.888888888 bcdef abcdefgh 68692CCAC0BDE7 9.44 +108 3 4 1.3 2.4 yza 2019-09-09 09:09:09.999999999 cdefg B4F3CAFDBE 68656C6C6F 77.54 +109 1 5 1.4 2.7 bcd 2020-10-10 10:10:10.101010101 klmno abcdedef 68692CCAC0BDE7 25.42 +110 2 1 1.0 3.0 efg 2021-11-11 11:11:11.111111111 pqrst abcdede B4F3CAFDBEDD 60.12 +111 3 2 1.1 3.3 hij 2022-12-12 12:12:12.121212121 nopqr abcded 68692CCAC0BDE7 49.56 +112 1 3 1.2 3.6 klm 2023-01-02 13:13:13.131313131 opqrs abcdd B4F3CAFDBEDD 80.76 +113 2 4 1.3 3.9 nop 2024-02-02 14:14:14.141414141 pqrst abc 68692CCAC0BDE7 23.23 +114 3 5 1.4 4.2 qrs 2025-03-03 15:15:15.151515151 qrstu b B4F3CAFDBEDD 1.01 +115 1 1 1.0 4.5 qrs 2026-04-04 16:16:16.161616161 rstuv abcded 68692CCAC0BDE7 5.98 +116 2 2 1.1 4.8 wxy 2027-05-05 17:17:17.171717171 stuvw abcded B4F3CAFDBEDD 11.22 +117 3 3 1.2 5.1 zab 2028-06-06 18:18:18.181818181 tuvwx abcded 68692CCAC0BDE7 9.88 +118 1 4 1.3 5.4 cde 2029-07-07 19:19:19.191919191 uvwzy abcdede B4F3CAFDBEDD 4.76 +119 2 5 1.4 5.7 fgh 2030-08-08 20:20:20.202020202 vwxyz abcdede 68692CCAC0BDE7 12.83 +120 3 1 1.0 6.0 ijk 2031-09-09 21:21:21.212121212 wxyza abcde B4F3CAFDBEDD 73.04 +121 1 2 1.1 6.3 lmn 2032-10-10 22:22:22.222222222 bcdef abcde 90.33 +PREHOOK: query: explain vectorization expression +SELECT cchar, LENGTH(cchar), cvarchar, LENGTH(cvarchar), cdecimal, SIGN(cdecimal) FROM parquet_type_nodict +PREHOOK: type: QUERY +POSTHOOK: query: explain vectorization expression +SELECT cchar, LENGTH(cchar), cvarchar, LENGTH(cvarchar), cdecimal, SIGN(cdecimal) FROM parquet_type_nodict +POSTHOOK: type: QUERY +PLAN VECTORIZATION: + enabled: true + enabledConditionsMet: [hive.vectorized.execution.enabled IS true] + +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + TableScan + alias: parquet_type_nodict + Select Operator + expressions: cchar (type: char(5)), length(cchar) (type: int), cvarchar (type: varchar(10)), length(cvarchar) (type: int), cdecimal (type: decimal(4,2)), sign(cdecimal) (type: int) + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5 + ListSink + +PREHOOK: query: SELECT cchar, LENGTH(cchar), cvarchar, LENGTH(cvarchar), cdecimal, SIGN(cdecimal) FROM parquet_type_nodict +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_type_nodict +#### A masked pattern was here #### +POSTHOOK: query: SELECT cchar, LENGTH(cchar), cvarchar, LENGTH(cvarchar), cdecimal, SIGN(cdecimal) FROM parquet_type_nodict +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_type_nodict +#### A masked pattern was here #### +a 1 a 3 48.88 1 +ab 2 ab 3 8.72 1 +abc 3 abc 3 90.21 1 +abcd 4 abcd 4 3.89 1 +abcde 5 abcde 5 56.23 1 +abcde 5 abcdef 6 90.21 1 +abcde 5 abcdefg 7 6.09 1 +bcdef 5 abcdefgh 8 9.44 1 +cdefg 5 B4F3CAFDBE 10 77.54 1 +klmno 5 abcdedef 8 25.42 1 +pqrst 5 
abcdede 7 60.12 1 +nopqr 5 abcded 6 49.56 1 +opqrs 5 abcdd 5 80.76 1 +pqrst 5 abc 3 23.23 1 +qrstu 5 b 1 1.01 1 +rstuv 5 abcded 6 5.98 1 +stuvw 5 abcded 6 11.22 1 +tuvwx 5 abcded 6 9.88 1 +uvwzy 5 abcdede 7 4.76 1 +vwxyz 5 abcdede 7 12.83 1 +wxyza 5 abcde 5 73.04 1 +bcdef 5 abcde 5 90.33 1 +PREHOOK: query: explain vectorization select max(t), min(t) from parquet_type_nodict +PREHOOK: type: QUERY +POSTHOOK: query: explain vectorization select max(t), min(t) from parquet_type_nodict +POSTHOOK: type: QUERY +PLAN VECTORIZATION: + enabled: true + enabledConditionsMet: [hive.vectorized.execution.enabled IS true] + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (CUSTOM_SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: parquet_type_nodict + Statistics: Num rows: 22 Data size: 880 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: t (type: timestamp) + outputColumnNames: t + Statistics: Num rows: 22 Data size: 880 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: max(t), min(t) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 80 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 80 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: timestamp), _col1 (type: timestamp) + Execution mode: vectorized, llap + LLAP IO: all inputs (cache only) + Map Vectorization: + enabled: true + enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true + inputFormatFeatureSupport: [] + featureSupportInUse: [] + inputFileFormats: org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat + allNative: false + usesVectorUDFAdaptor: false + vectorized: true + Reducer 2 + Execution mode: vectorized, llap + Reduce Vectorization: + enabled: true + enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true, hive.execution.engine tez IN [tez, spark] IS true + allNative: false + usesVectorUDFAdaptor: false + vectorized: true + Reduce Operator Tree: + Group By Operator + aggregations: max(VALUE._col0), min(VALUE._col1) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 80 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 80 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select max(t), min(t) from parquet_type_nodict +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_type_nodict +#### A masked pattern was here #### +POSTHOOK: query: select max(t), min(t) from parquet_type_nodict +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_type_nodict +#### A masked pattern was here #### +2032-10-10 22:22:22.222222222 2011-01-01 01:01:01.111111111 +PREHOOK: query: create table test (id int, ts timestamp) stored as parquet tblproperties ("parquet.enable.dictionary"="false") +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@test +POSTHOOK: query: create table test (id 
int, ts timestamp) stored as parquet tblproperties ("parquet.enable.dictionary"="false") +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@test +PREHOOK: query: insert into test values (1, '2019-01-01 23:12:45.123456'), (2, '2019-01-01 23:12:45.123456'), (3, '2019-01-01 23:12:45.123456') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@test +POSTHOOK: query: insert into test values (1, '2019-01-01 23:12:45.123456'), (2, '2019-01-01 23:12:45.123456'), (3, '2019-01-01 23:12:45.123456') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@test +POSTHOOK: Lineage: test.id SCRIPT [] +POSTHOOK: Lineage: test.ts SCRIPT [] +PREHOOK: query: select ts from test where id > 1 +PREHOOK: type: QUERY +PREHOOK: Input: default@test +#### A masked pattern was here #### +POSTHOOK: query: select ts from test where id > 1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test +#### A masked pattern was here #### +2019-01-01 23:12:45.123456 +2019-01-01 23:12:45.123456 +PREHOOK: query: insert into test values (3, NULL) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@test +POSTHOOK: query: insert into test values (3, NULL) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@test +POSTHOOK: Lineage: test.id SCRIPT [] +POSTHOOK: Lineage: test.ts EXPRESSION [] +PREHOOK: query: select ts from test where id > 1 +PREHOOK: type: QUERY +PREHOOK: Input: default@test +#### A masked pattern was here #### +POSTHOOK: query: select ts from test where id > 1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test +#### A masked pattern was here #### +2019-01-01 23:12:45.123456 +2019-01-01 23:12:45.123456 +NULL +PREHOOK: query: DROP TABLE parquet_type_nodict +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@parquet_type_nodict +PREHOOK: Output: default@parquet_type_nodict +POSTHOOK: query: DROP TABLE parquet_type_nodict +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@parquet_type_nodict +POSTHOOK: Output: default@parquet_type_nodict +PREHOOK: query: DROP TABLE test +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@test +PREHOOK: Output: default@test +POSTHOOK: query: DROP TABLE test +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@test +POSTHOOK: Output: default@test diff --git a/ql/src/test/results/clientpositive/vectorized_parquet_types.q.out b/ql/src/test/results/clientpositive/vectorized_parquet_types.q.out index 1a08d4627ac3600800a4189f6d9f62e5740386dc..0dc582f77fa716b8919c4626b5d821025b654fe5 100644 --- a/ql/src/test/results/clientpositive/vectorized_parquet_types.q.out +++ b/ql/src/test/results/clientpositive/vectorized_parquet_types.q.out @@ -6,6 +6,10 @@ PREHOOK: query: DROP TABLE parquet_types PREHOOK: type: DROPTABLE POSTHOOK: query: DROP TABLE parquet_types POSTHOOK: type: DROPTABLE +PREHOOK: query: DROP TABLE IF EXISTS parquet_type_nodict +PREHOOK: type: DROPTABLE +POSTHOOK: query: DROP TABLE IF EXISTS parquet_type_nodict +POSTHOOK: type: DROPTABLE PREHOOK: query: CREATE TABLE parquet_types_staging ( cint int, ctinyint tinyint, @@ -478,3 +482,356 @@ POSTHOOK: Input: default@parquet_types 1 121 1 8 1.1749999970197678 2.0621590627301285 90.33 2 119 1 7 1.2142857142857142 1.8 60.12 3 120 1 7 1.171428578240531 1.7999999999999996 90.21 +PREHOOK: query: create table parquet_type_nodict like parquet_types +stored as parquet tblproperties ("parquet.enable.dictionary"="false") +PREHOOK: type: 
CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@parquet_type_nodict +POSTHOOK: query: create table parquet_type_nodict like parquet_types +stored as parquet tblproperties ("parquet.enable.dictionary"="false") +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@parquet_type_nodict +PREHOOK: query: insert into parquet_type_nodict +select * from parquet_types +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_types +PREHOOK: Output: default@parquet_type_nodict +POSTHOOK: query: insert into parquet_type_nodict +select * from parquet_types +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_types +POSTHOOK: Output: default@parquet_type_nodict +POSTHOOK: Lineage: parquet_type_nodict.cbinary SIMPLE [(parquet_types)parquet_types.FieldSchema(name:cbinary, type:binary, comment:null), ] +POSTHOOK: Lineage: parquet_type_nodict.cchar SIMPLE [(parquet_types)parquet_types.FieldSchema(name:cchar, type:char(5), comment:null), ] +POSTHOOK: Lineage: parquet_type_nodict.cdecimal SIMPLE [(parquet_types)parquet_types.FieldSchema(name:cdecimal, type:decimal(4,2), comment:null), ] +POSTHOOK: Lineage: parquet_type_nodict.cdouble SIMPLE [(parquet_types)parquet_types.FieldSchema(name:cdouble, type:double, comment:null), ] +POSTHOOK: Lineage: parquet_type_nodict.cfloat SIMPLE [(parquet_types)parquet_types.FieldSchema(name:cfloat, type:float, comment:null), ] +POSTHOOK: Lineage: parquet_type_nodict.cint SIMPLE [(parquet_types)parquet_types.FieldSchema(name:cint, type:int, comment:null), ] +POSTHOOK: Lineage: parquet_type_nodict.csmallint SIMPLE [(parquet_types)parquet_types.FieldSchema(name:csmallint, type:smallint, comment:null), ] +POSTHOOK: Lineage: parquet_type_nodict.cstring1 SIMPLE [(parquet_types)parquet_types.FieldSchema(name:cstring1, type:string, comment:null), ] +POSTHOOK: Lineage: parquet_type_nodict.ctinyint SIMPLE [(parquet_types)parquet_types.FieldSchema(name:ctinyint, type:tinyint, comment:null), ] +POSTHOOK: Lineage: parquet_type_nodict.cvarchar SIMPLE [(parquet_types)parquet_types.FieldSchema(name:cvarchar, type:varchar(10), comment:null), ] +POSTHOOK: Lineage: parquet_type_nodict.t SIMPLE [(parquet_types)parquet_types.FieldSchema(name:t, type:timestamp, comment:null), ] +PREHOOK: query: explain vectorization expression +SELECT cint, ctinyint, csmallint, cfloat, cdouble, cstring1, t, cchar, cvarchar, +hex(cbinary), cdecimal FROM parquet_type_nodict +PREHOOK: type: QUERY +POSTHOOK: query: explain vectorization expression +SELECT cint, ctinyint, csmallint, cfloat, cdouble, cstring1, t, cchar, cvarchar, +hex(cbinary), cdecimal FROM parquet_type_nodict +POSTHOOK: type: QUERY +PLAN VECTORIZATION: + enabled: true + enabledConditionsMet: [hive.vectorized.execution.enabled IS true] + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: parquet_type_nodict + Statistics: Num rows: 22 Data size: 242 Basic stats: COMPLETE Column stats: NONE + TableScan Vectorization: + native: true + Select Operator + expressions: cint (type: int), ctinyint (type: tinyint), csmallint (type: smallint), cfloat (type: float), cdouble (type: double), cstring1 (type: string), t (type: timestamp), cchar (type: char(5)), cvarchar (type: varchar(10)), hex(cbinary) (type: string), cdecimal (type: decimal(4,2)) + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10 + Select Vectorization: + className: 
VectorSelectOperator + native: true + projectedOutputColumnNums: [0, 1, 2, 3, 4, 5, 6, 7, 8, 12, 10] + selectExpressions: VectorUDFAdaptor(hex(cbinary)) -> 12:string + Statistics: Num rows: 22 Data size: 242 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + File Sink Vectorization: + className: VectorFileSinkOperator + native: false + Statistics: Num rows: 22 Data size: 242 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Execution mode: vectorized + Map Vectorization: + enabled: true + enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true + inputFormatFeatureSupport: [] + featureSupportInUse: [] + inputFileFormats: org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat + allNative: false + usesVectorUDFAdaptor: true + vectorized: true + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: SELECT cint, ctinyint, csmallint, cfloat, cdouble, cstring1, t, cchar, cvarchar, +hex(cbinary), cdecimal FROM parquet_type_nodict +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_type_nodict +#### A masked pattern was here #### +POSTHOOK: query: SELECT cint, ctinyint, csmallint, cfloat, cdouble, cstring1, t, cchar, cvarchar, +hex(cbinary), cdecimal FROM parquet_type_nodict +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_type_nodict +#### A masked pattern was here #### +100 1 1 1.0 0.0 abc 2011-01-01 01:01:01.111111111 a a B4F3CAFDBEDD 48.88 +101 2 2 1.1 0.3 def 2012-02-02 02:02:02.222222222 ab ab 68692CCAC0BDE7 8.72 +102 3 3 1.2 0.6 ghi 2013-03-03 03:03:03.333333333 abc abc B4F3CAFDBEDD 90.21 +103 1 4 1.3 0.9 jkl 2014-04-04 04:04:04.444444444 abcd abcd 68692CCAC0BDE7 3.89 +104 2 5 1.4 1.2 mno 2015-05-05 05:05:05.555555555 abcde abcde B4F3CAFDBEDD 56.23 +105 3 1 1.0 1.5 pqr 2016-06-06 06:06:06.666666666 abcde abcdef 68692CCAC0BDE7 90.21 +106 1 2 1.1 1.8 stu 2017-07-07 07:07:07.777777777 abcde abcdefg B4F3CAFDBEDD 6.09 +107 2 3 1.2 2.1 vwx 2018-08-08 08:08:08.888888888 bcdef abcdefgh 68692CCAC0BDE7 9.44 +108 3 4 1.3 2.4 yza 2019-09-09 09:09:09.999999999 cdefg B4F3CAFDBE 68656C6C6F 77.54 +109 1 5 1.4 2.7 bcd 2020-10-10 10:10:10.101010101 klmno abcdedef 68692CCAC0BDE7 25.42 +110 2 1 1.0 3.0 efg 2021-11-11 11:11:11.111111111 pqrst abcdede B4F3CAFDBEDD 60.12 +111 3 2 1.1 3.3 hij 2022-12-12 12:12:12.121212121 nopqr abcded 68692CCAC0BDE7 49.56 +112 1 3 1.2 3.6 klm 2023-01-02 13:13:13.131313131 opqrs abcdd B4F3CAFDBEDD 80.76 +113 2 4 1.3 3.9 nop 2024-02-02 14:14:14.141414141 pqrst abc 68692CCAC0BDE7 23.23 +114 3 5 1.4 4.2 qrs 2025-03-03 15:15:15.151515151 qrstu b B4F3CAFDBEDD 1.01 +115 1 1 1.0 4.5 qrs 2026-04-04 16:16:16.161616161 rstuv abcded 68692CCAC0BDE7 5.98 +116 2 2 1.1 4.8 wxy 2027-05-05 17:17:17.171717171 stuvw abcded B4F3CAFDBEDD 11.22 +117 3 3 1.2 5.1 zab 2028-06-06 18:18:18.181818181 tuvwx abcded 68692CCAC0BDE7 9.88 +118 1 4 1.3 5.4 cde 2029-07-07 19:19:19.191919191 uvwzy abcdede B4F3CAFDBEDD 4.76 +119 2 5 1.4 5.7 fgh 2030-08-08 20:20:20.202020202 vwxyz abcdede 68692CCAC0BDE7 12.83 +120 3 1 1.0 6.0 ijk 2031-09-09 21:21:21.212121212 wxyza abcde B4F3CAFDBEDD 73.04 +121 1 2 1.1 6.3 lmn 2032-10-10 22:22:22.222222222 bcdef abcde 90.33 +PREHOOK: query: explain vectorization expression +SELECT cchar, LENGTH(cchar), cvarchar, LENGTH(cvarchar), cdecimal, SIGN(cdecimal) FROM parquet_type_nodict 
+PREHOOK: type: QUERY +POSTHOOK: query: explain vectorization expression +SELECT cchar, LENGTH(cchar), cvarchar, LENGTH(cvarchar), cdecimal, SIGN(cdecimal) FROM parquet_type_nodict +POSTHOOK: type: QUERY +PLAN VECTORIZATION: + enabled: true + enabledConditionsMet: [hive.vectorized.execution.enabled IS true] + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: parquet_type_nodict + Statistics: Num rows: 22 Data size: 242 Basic stats: COMPLETE Column stats: NONE + TableScan Vectorization: + native: true + Select Operator + expressions: cchar (type: char(5)), length(cchar) (type: int), cvarchar (type: varchar(10)), length(cvarchar) (type: int), cdecimal (type: decimal(4,2)), sign(cdecimal) (type: int) + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5 + Select Vectorization: + className: VectorSelectOperator + native: true + projectedOutputColumnNums: [7, 12, 8, 13, 10, 14] + selectExpressions: StringLength(col 7:char(5)) -> 12:int, StringLength(col 8:varchar(10)) -> 13:int, FuncSignDecimalToLong(col 10:decimal(4,2)) -> 14:int + Statistics: Num rows: 22 Data size: 242 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + File Sink Vectorization: + className: VectorFileSinkOperator + native: false + Statistics: Num rows: 22 Data size: 242 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Execution mode: vectorized + Map Vectorization: + enabled: true + enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true + inputFormatFeatureSupport: [] + featureSupportInUse: [] + inputFileFormats: org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat + allNative: false + usesVectorUDFAdaptor: false + vectorized: true + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: SELECT cchar, LENGTH(cchar), cvarchar, LENGTH(cvarchar), cdecimal, SIGN(cdecimal) FROM parquet_type_nodict +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_type_nodict +#### A masked pattern was here #### +POSTHOOK: query: SELECT cchar, LENGTH(cchar), cvarchar, LENGTH(cvarchar), cdecimal, SIGN(cdecimal) FROM parquet_type_nodict +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_type_nodict +#### A masked pattern was here #### +a 1 a 3 48.88 1 +ab 2 ab 3 8.72 1 +abc 3 abc 3 90.21 1 +abcd 4 abcd 4 3.89 1 +abcde 5 abcde 5 56.23 1 +abcde 5 abcdef 6 90.21 1 +abcde 5 abcdefg 7 6.09 1 +bcdef 5 abcdefgh 8 9.44 1 +cdefg 5 B4F3CAFDBE 10 77.54 1 +klmno 5 abcdedef 8 25.42 1 +pqrst 5 abcdede 7 60.12 1 +nopqr 5 abcded 6 49.56 1 +opqrs 5 abcdd 5 80.76 1 +pqrst 5 abc 3 23.23 1 +qrstu 5 b 1 1.01 1 +rstuv 5 abcded 6 5.98 1 +stuvw 5 abcded 6 11.22 1 +tuvwx 5 abcded 6 9.88 1 +uvwzy 5 abcdede 7 4.76 1 +vwxyz 5 abcdede 7 12.83 1 +wxyza 5 abcde 5 73.04 1 +bcdef 5 abcde 5 90.33 1 +PREHOOK: query: explain vectorization select max(t), min(t) from parquet_type_nodict +PREHOOK: type: QUERY +POSTHOOK: query: explain vectorization select max(t), min(t) from parquet_type_nodict +POSTHOOK: type: QUERY +PLAN VECTORIZATION: + enabled: true + enabledConditionsMet: [hive.vectorized.execution.enabled IS true] + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + 
Map Operator Tree: + TableScan + alias: parquet_type_nodict + Statistics: Num rows: 22 Data size: 242 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: t (type: timestamp) + outputColumnNames: t + Statistics: Num rows: 22 Data size: 242 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: max(t), min(t) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 80 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 80 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: timestamp), _col1 (type: timestamp) + Execution mode: vectorized + Map Vectorization: + enabled: true + enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true + inputFormatFeatureSupport: [] + featureSupportInUse: [] + inputFileFormats: org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat + allNative: false + usesVectorUDFAdaptor: false + vectorized: true + Reduce Vectorization: + enabled: false + enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true + enableConditionsNotMet: hive.execution.engine mr IN [tez, spark] IS false + Reduce Operator Tree: + Group By Operator + aggregations: max(VALUE._col0), min(VALUE._col1) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 80 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 80 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select max(t), min(t) from parquet_type_nodict +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_type_nodict +#### A masked pattern was here #### +POSTHOOK: query: select max(t), min(t) from parquet_type_nodict +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_type_nodict +#### A masked pattern was here #### +2032-10-10 22:22:22.222222222 2011-01-01 01:01:01.111111111 +PREHOOK: query: create table test (id int, ts timestamp) stored as parquet tblproperties ("parquet.enable.dictionary"="false") +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@test +POSTHOOK: query: create table test (id int, ts timestamp) stored as parquet tblproperties ("parquet.enable.dictionary"="false") +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@test +PREHOOK: query: insert into test values (1, '2019-01-01 23:12:45.123456'), (2, '2019-01-01 23:12:45.123456'), (3, '2019-01-01 23:12:45.123456') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@test +POSTHOOK: query: insert into test values (1, '2019-01-01 23:12:45.123456'), (2, '2019-01-01 23:12:45.123456'), (3, '2019-01-01 23:12:45.123456') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@test +POSTHOOK: Lineage: test.id SCRIPT [] +POSTHOOK: Lineage: test.ts SCRIPT [] +PREHOOK: query: select ts from test where id > 1 +PREHOOK: type: QUERY +PREHOOK: Input: default@test +#### A masked pattern was here #### +POSTHOOK: query: select ts from test where id > 1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test +#### A masked pattern was here #### 
+2019-01-01 23:12:45.123456 +2019-01-01 23:12:45.123456 +PREHOOK: query: insert into test values (3, NULL) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@test +POSTHOOK: query: insert into test values (3, NULL) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@test +POSTHOOK: Lineage: test.id SCRIPT [] +POSTHOOK: Lineage: test.ts EXPRESSION [] +PREHOOK: query: select ts from test where id > 1 +PREHOOK: type: QUERY +PREHOOK: Input: default@test +#### A masked pattern was here #### +POSTHOOK: query: select ts from test where id > 1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test +#### A masked pattern was here #### +2019-01-01 23:12:45.123456 +2019-01-01 23:12:45.123456 +NULL +PREHOOK: query: DROP TABLE parquet_type_nodict +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@parquet_type_nodict +PREHOOK: Output: default@parquet_type_nodict +POSTHOOK: query: DROP TABLE parquet_type_nodict +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@parquet_type_nodict +POSTHOOK: Output: default@parquet_type_nodict +PREHOOK: query: DROP TABLE test +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@test +PREHOOK: Output: default@test +POSTHOOK: query: DROP TABLE test +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@test +POSTHOOK: Output: default@test
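Reviewer note on the isRepeating coverage: the "test timestamp columnVector isRepeating" queries insert three identical timestamps with dictionary encoding disabled and then a NULL row, exercising both directions of the flag maintained by readTimestamp(). The standalone sketch below restates that invariant; IsRepeatingDemo is illustrative only, not part of the patch, and assumes TimestampColumnVector from hive-storage-api is on the classpath.

    import java.sql.Timestamp;
    import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;

    // The flag may only survive while every populated row equals row 0 in
    // both the time[] (millis) and nanos[] components; a null row clears
    // it and noNulls, exactly as readTimestamp() does.
    public class IsRepeatingDemo {
      public static void main(String[] args) {
        TimestampColumnVector c = new TimestampColumnVector(3);
        Timestamp ts = Timestamp.valueOf("2019-01-01 23:12:45.123456");

        c.isRepeating = true; // start optimistic; the reader AND-accumulates below
        for (int rowId = 0; rowId < 2; rowId++) {
          c.set(rowId, ts);
          c.isNull[rowId] = false;
          c.isRepeating = c.isRepeating
              && (c.time[0] == c.time[rowId]) && (c.nanos[0] == c.nanos[rowId]);
        }
        System.out.println(c.isRepeating); // true: rows 0 and 1 are identical

        // The (3, NULL) insert in the test lands here: a null value must
        // break repetition and record that the vector now contains nulls.
        c.isNull[2] = true;
        c.isRepeating = false;
        c.noNulls = false;
        System.out.println(c.isRepeating); // false
      }
    }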