commit 842ec0e790b23615e48793fa28d4e5d7d4e752c8 Author: Vihang Karajgaonkar Date: Sun Oct 22 16:34:35 2017 -0700 HIVE-17874 : Parquet vectorization fails on tables with complex columns when there are no projected columns diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties index 639ffa8a251eb1d58bfc3b8151f7c149a907818c..e2c59f2012f80937bc3ceb78868efbd05f7aa267 100644 --- a/itests/src/test/resources/testconfiguration.properties +++ b/itests/src/test/resources/testconfiguration.properties @@ -1343,6 +1343,7 @@ spark.query.files=add_part_multiple.q, \ vectorization_not.q, \ vectorization_part.q, \ vectorization_part_project.q, \ + vectorization_parquet_projection.q, \ vectorization_pushdown.q, \ vectorization_short_regress.q, \ vectorized_case.q, \ diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedParquetRecordReader.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedParquetRecordReader.java index ecdd8bc215754ce4eea53423685a9a0532bbfbdc..e7d003387e8b4aaa02c2f1c21fa42baed93edbc9 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedParquetRecordReader.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedParquetRecordReader.java @@ -71,7 +71,6 @@ private List columnNamesList; private List columnTypesList; private VectorizedRowBatchCtx rbCtx; - private List indexColumnsWanted; /** * For each request column, the reader to read this column. 
This is NULL if this column @@ -103,7 +102,6 @@ public VectorizedParquetRecordReader( serDeStats = new SerDeStats(); projectionPusher = new ProjectionPusher(); initialize(inputSplit, conf); - colsToInclude = ColumnProjectionUtils.getReadColumnIDs(conf); rbCtx = Utilities.getVectorizedRowBatchCtx(conf); } catch (Throwable e) { LOG.error("Failed to create the vectorized reader due to exception " + e); @@ -121,7 +119,6 @@ public VectorizedParquetRecordReader( if (inputSplit != null) { initialize(inputSplit, conf); } - colsToInclude = ColumnProjectionUtils.getReadColumnIDs(conf); rbCtx = Utilities.getVectorizedRowBatchCtx(conf); } catch (Throwable e) { LOG.error("Failed to create the vectorized reader due to exception " + e); @@ -209,10 +206,10 @@ public void initialize( columnTypesList); } - indexColumnsWanted = ColumnProjectionUtils.getReadColumnIDs(configuration); - if (!ColumnProjectionUtils.isReadAllColumns(configuration) && !indexColumnsWanted.isEmpty()) { + colsToInclude = ColumnProjectionUtils.getReadColumnIDs(configuration); + if (!ColumnProjectionUtils.isReadAllColumns(configuration) && !colsToInclude.isEmpty()) { requestedSchema = - DataWritableReadSupport.getSchemaByIndex(tableSchema, columnNamesList, indexColumnsWanted); + DataWritableReadSupport.getSchemaByIndex(tableSchema, columnNamesList, colsToInclude); } else { requestedSchema = fileSchema; } @@ -291,11 +288,17 @@ private void checkEndOfRowGroup() throws IOException { List types = requestedSchema.getFields(); columnReaders = new VectorizedColumnReader[columns.size()]; - if (!ColumnProjectionUtils.isReadAllColumns(jobConf) && !indexColumnsWanted.isEmpty()) { - for (int i = 0; i < types.size(); ++i) { - columnReaders[i] = - buildVectorizedParquetReader(columnTypesList.get(indexColumnsWanted.get(i)), types.get(i), - pages, requestedSchema.getColumns(), skipTimestampConversion, 0); + if (!ColumnProjectionUtils.isReadAllColumns(jobConf)) { + //certain queries like select count(*) from table do not have + 
//any projected columns and still have isReadAllColumns as false + //in such cases columnReaders are not needed + //if there are colsToInclude initialize each columnReader + if(!colsToInclude.isEmpty()) { + for (int i = 0; i < types.size(); ++i) { + columnReaders[i] = + buildVectorizedParquetReader(columnTypesList.get(colsToInclude.get(i)), types.get(i), + pages, requestedSchema.getColumns(), skipTimestampConversion, 0); + } } } else { for (int i = 0; i < types.size(); ++i) { @@ -324,7 +327,7 @@ private void checkEndOfRowGroup() throws IOException { } // Build VectorizedParquetColumnReader via Hive typeInfo and Parquet schema - private VectorizedColumnReader buildVectorizedParquetReader( + private VectorizedColumnReader buildVectorizedParquetReader( TypeInfo typeInfo, Type type, PageReadStore pages, diff --git a/ql/src/test/queries/clientpositive/vectorization_parquet_projection.q b/ql/src/test/queries/clientpositive/vectorization_parquet_projection.q new file mode 100644 index 0000000000000000000000000000000000000000..71460979b3c489d97c80dff0dff406eb4b8be7c6 --- /dev/null +++ b/ql/src/test/queries/clientpositive/vectorization_parquet_projection.q @@ -0,0 +1,75 @@ +set hive.fetch.task.conversion=none; +set hive.compute.query.using.stats=false; +set hive.vectorized.use.row.serde.deserialize=false; +set hive.vectorized.use.vector.serde.deserialize=false; +set hive.vectorized.execution.enabled=true; +set hive.vectorized.execution.reduce.enabled=true; +set hive.mapred.mode=nonstrict; +set hive.llap.cache.allow.synthetic.fileid=true; + +DROP TABLE IF EXISTS parquet_types_staging; + +CREATE TABLE parquet_types_staging ( + cint int, + ctinyint tinyint, + csmallint smallint, + cfloat float, + cdouble double, + cstring1 string, + t timestamp, + cchar char(5), + cvarchar varchar(10), + cbinary string, + m1 map, + l1 array, + st1 struct, + d date +) ROW FORMAT DELIMITED +FIELDS TERMINATED BY '|' +COLLECTION ITEMS TERMINATED BY ',' +MAP KEYS TERMINATED BY ':'; + +LOAD DATA 
LOCAL INPATH '../../data/files/parquet_types.txt' OVERWRITE INTO TABLE parquet_types_staging; + +-- test various numbers of projected columns + +DROP TABLE IF EXISTS parquet_project_test; + +CREATE TABLE parquet_project_test( +cint int, +m1 map +) STORED AS PARQUET; + +insert into parquet_project_test +select ctinyint, map("color","red") from parquet_types_staging +where ctinyint = 1; + +insert into parquet_project_test +select ctinyint, map("color","green") from parquet_types_staging +where ctinyint = 2; + +insert into parquet_project_test +select ctinyint, map("color","blue") from parquet_types_staging +where ctinyint = 3; + +-- all columns in the projection (select *), not vectorized because m1 is a map +explain vectorization select * from parquet_project_test; +select * from parquet_project_test; + +-- no columns in the projection, just count(*) +explain vectorization select count(*) from parquet_project_test; +select count(*) from parquet_project_test; + +-- project a primitive type +explain vectorization select cint, count(*) from parquet_project_test +group by cint; + +select cint, count(*) from parquet_project_test +group by cint; + +-- test complex type in projection, this should not get vectorized +explain vectorization select m1["color"], count(*) from parquet_project_test +group by m1["color"]; + +select m1["color"], count(*) from parquet_project_test +group by m1["color"]; diff --git a/ql/src/test/results/clientpositive/spark/vectorization_parquet_projection.q.out b/ql/src/test/results/clientpositive/spark/vectorization_parquet_projection.q.out new file mode 100644 index 0000000000000000000000000000000000000000..911b6c1c356a17e70126eb3ac6895a8d84bb649e --- /dev/null +++ b/ql/src/test/results/clientpositive/spark/vectorization_parquet_projection.q.out @@ -0,0 +1,459 @@ +PREHOOK: query: DROP TABLE IF EXISTS parquet_types_staging +PREHOOK: type: DROPTABLE +POSTHOOK: query: DROP TABLE IF EXISTS parquet_types_staging +POSTHOOK: type: DROPTABLE +PREHOOK: query: CREATE TABLE parquet_types_staging ( + cint
int, + ctinyint tinyint, + csmallint smallint, + cfloat float, + cdouble double, + cstring1 string, + t timestamp, + cchar char(5), + cvarchar varchar(10), + cbinary string, + m1 map, + l1 array, + st1 struct, + d date +) ROW FORMAT DELIMITED +FIELDS TERMINATED BY '|' +COLLECTION ITEMS TERMINATED BY ',' +MAP KEYS TERMINATED BY ':' +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@parquet_types_staging +POSTHOOK: query: CREATE TABLE parquet_types_staging ( + cint int, + ctinyint tinyint, + csmallint smallint, + cfloat float, + cdouble double, + cstring1 string, + t timestamp, + cchar char(5), + cvarchar varchar(10), + cbinary string, + m1 map, + l1 array, + st1 struct, + d date +) ROW FORMAT DELIMITED +FIELDS TERMINATED BY '|' +COLLECTION ITEMS TERMINATED BY ',' +MAP KEYS TERMINATED BY ':' +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@parquet_types_staging +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/parquet_types.txt' OVERWRITE INTO TABLE parquet_types_staging +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@parquet_types_staging +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/parquet_types.txt' OVERWRITE INTO TABLE parquet_types_staging +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@parquet_types_staging +PREHOOK: query: DROP TABLE IF EXISTS parquet_project_test +PREHOOK: type: DROPTABLE +POSTHOOK: query: DROP TABLE IF EXISTS parquet_project_test +POSTHOOK: type: DROPTABLE +PREHOOK: query: CREATE TABLE parquet_project_test( +cint int, +m1 map +) STORED AS PARQUET +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@parquet_project_test +POSTHOOK: query: CREATE TABLE parquet_project_test( +cint int, +m1 map +) STORED AS PARQUET +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@parquet_project_test +PREHOOK: 
query: insert into parquet_project_test +select ctinyint, map("color","red") from parquet_types_staging +where ctinyint = 1 +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_types_staging +PREHOOK: Output: default@parquet_project_test +POSTHOOK: query: insert into parquet_project_test +select ctinyint, map("color","red") from parquet_types_staging +where ctinyint = 1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_types_staging +POSTHOOK: Output: default@parquet_project_test +POSTHOOK: Lineage: parquet_project_test.cint EXPRESSION [] +POSTHOOK: Lineage: parquet_project_test.m1 EXPRESSION [] +PREHOOK: query: insert into parquet_project_test +select ctinyint, map("color","green") from parquet_types_staging +where ctinyint = 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_types_staging +PREHOOK: Output: default@parquet_project_test +POSTHOOK: query: insert into parquet_project_test +select ctinyint, map("color","green") from parquet_types_staging +where ctinyint = 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_types_staging +POSTHOOK: Output: default@parquet_project_test +POSTHOOK: Lineage: parquet_project_test.cint EXPRESSION [] +POSTHOOK: Lineage: parquet_project_test.m1 EXPRESSION [] +PREHOOK: query: insert into parquet_project_test +select ctinyint, map("color","blue") from parquet_types_staging +where ctinyint = 3 +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_types_staging +PREHOOK: Output: default@parquet_project_test +POSTHOOK: query: insert into parquet_project_test +select ctinyint, map("color","blue") from parquet_types_staging +where ctinyint = 3 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_types_staging +POSTHOOK: Output: default@parquet_project_test +POSTHOOK: Lineage: parquet_project_test.cint EXPRESSION [] +POSTHOOK: Lineage: parquet_project_test.m1 EXPRESSION [] +PREHOOK: query: explain vectorization select * from parquet_project_test +PREHOOK: type: QUERY +POSTHOOK: query: explain vectorization 
select * from parquet_project_test +POSTHOOK: type: QUERY +PLAN VECTORIZATION: + enabled: true + enabledConditionsMet: [hive.vectorized.execution.enabled IS true] + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: parquet_project_test + Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: cint (type: int), m1 (type: map) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Map Vectorization: + enabled: true + enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true + inputFileFormats: org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat + notVectorizedReason: Select expression for SELECT operator: Data type map of Column[m1] not supported + vectorized: false + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select * from parquet_project_test +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_project_test +#### A masked pattern was here #### +POSTHOOK: query: select * from parquet_project_test +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_project_test +#### A masked pattern was here #### +1 {"color":"red"} +1 {"color":"red"} +1 {"color":"red"} +1 {"color":"red"} +1 {"color":"red"} +1 {"color":"red"} +1 {"color":"red"} +1 {"color":"red"} +2 {"color":"green"} +2 {"color":"green"} +2 {"color":"green"} +2 {"color":"green"} +2 {"color":"green"} +2 
{"color":"green"} +2 {"color":"green"} +3 {"color":"blue"} +3 {"color":"blue"} +3 {"color":"blue"} +3 {"color":"blue"} +3 {"color":"blue"} +3 {"color":"blue"} +3 {"color":"blue"} +PREHOOK: query: explain vectorization select count(*) from parquet_project_test +PREHOOK: type: QUERY +POSTHOOK: query: explain vectorization select count(*) from parquet_project_test +POSTHOOK: type: QUERY +PLAN VECTORIZATION: + enabled: true + enabledConditionsMet: [hive.vectorized.execution.enabled IS true] + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Spark + Edges: + Reducer 2 <- Map 1 (GROUP, 1) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: parquet_project_test + Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: count() + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: bigint) + Execution mode: vectorized + Map Vectorization: + enabled: true + enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true + groupByVectorOutput: true + inputFileFormats: org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat + allNative: false + usesVectorUDFAdaptor: false + vectorized: true + Reducer 2 + Execution mode: vectorized + Reduce Vectorization: + enabled: true + enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true, hive.execution.engine spark IN [tez, spark] IS true + groupByVectorOutput: true + allNative: false + usesVectorUDFAdaptor: false + vectorized: true + Reduce Operator Tree: + Group By Operator + aggregations: 
count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select count(*) from parquet_project_test +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_project_test +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from parquet_project_test +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_project_test +#### A masked pattern was here #### +22 +PREHOOK: query: explain vectorization select cint, count(*) from parquet_project_test +group by cint +PREHOOK: type: QUERY +POSTHOOK: query: explain vectorization select cint, count(*) from parquet_project_test +group by cint +POSTHOOK: type: QUERY +PLAN VECTORIZATION: + enabled: true + enabledConditionsMet: [hive.vectorized.execution.enabled IS true] + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Spark + Edges: + Reducer 2 <- Map 1 (GROUP, 2) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: parquet_project_test + Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: cint (type: int) + outputColumnNames: cint + Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count() + keys: cint (type: int) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE + Reduce Output 
Operator + key expressions: _col0 (type: int) + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: bigint) + Execution mode: vectorized + Map Vectorization: + enabled: true + enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true + groupByVectorOutput: true + inputFileFormats: org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat + allNative: false + usesVectorUDFAdaptor: false + vectorized: true + Reducer 2 + Execution mode: vectorized + Reduce Vectorization: + enabled: true + enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true, hive.execution.engine spark IN [tez, spark] IS true + groupByVectorOutput: true + allNative: false + usesVectorUDFAdaptor: false + vectorized: true + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: int) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 11 Data size: 22 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 11 Data size: 22 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select cint, count(*) from parquet_project_test +group by cint +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_project_test +#### A masked pattern was here #### +POSTHOOK: query: select cint, count(*) from parquet_project_test +group by cint +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_project_test +#### A masked pattern was here #### +2 7 +1 8 +3 7 +PREHOOK: query: explain vectorization select m1["color"], count(*) from parquet_project_test 
+group by m1["color"] +PREHOOK: type: QUERY +POSTHOOK: query: explain vectorization select m1["color"], count(*) from parquet_project_test +group by m1["color"] +POSTHOOK: type: QUERY +PLAN VECTORIZATION: + enabled: true + enabledConditionsMet: [hive.vectorized.execution.enabled IS true] + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Spark + Edges: + Reducer 2 <- Map 1 (GROUP, 2) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: parquet_project_test + Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: m1['color'] (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count() + keys: _col0 (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: bigint) + Map Vectorization: + enabled: true + enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true + inputFileFormats: org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat + notVectorizedReason: Select expression for SELECT operator: Data type map of Column[m1] not supported + vectorized: false + Reducer 2 + Execution mode: vectorized + Reduce Vectorization: + enabled: true + enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true, hive.execution.engine spark IN [tez, spark] IS true + groupByVectorOutput: true + allNative: false + usesVectorUDFAdaptor: false + vectorized: true + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: 
KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 11 Data size: 22 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 11 Data size: 22 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select m1["color"], count(*) from parquet_project_test +group by m1["color"] +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_project_test +#### A masked pattern was here #### +POSTHOOK: query: select m1["color"], count(*) from parquet_project_test +group by m1["color"] +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_project_test +#### A masked pattern was here #### +blue 7 +green 7 +red 8 diff --git a/ql/src/test/results/clientpositive/vectorization_parquet_projection.q.out b/ql/src/test/results/clientpositive/vectorization_parquet_projection.q.out new file mode 100644 index 0000000000000000000000000000000000000000..02a28de3fcaa82c98cbac3395bb621d4dc6ac6c0 --- /dev/null +++ b/ql/src/test/results/clientpositive/vectorization_parquet_projection.q.out @@ -0,0 +1,426 @@ +PREHOOK: query: DROP TABLE IF EXISTS parquet_types_staging +PREHOOK: type: DROPTABLE +POSTHOOK: query: DROP TABLE IF EXISTS parquet_types_staging +POSTHOOK: type: DROPTABLE +PREHOOK: query: CREATE TABLE parquet_types_staging ( + cint int, + ctinyint tinyint, + csmallint smallint, + cfloat float, + cdouble double, + cstring1 string, + t timestamp, + cchar char(5), + cvarchar varchar(10), + cbinary string, + m1 map, + l1 array, + st1 struct, + d date +) ROW FORMAT DELIMITED +FIELDS TERMINATED BY '|' +COLLECTION ITEMS TERMINATED BY ',' +MAP KEYS TERMINATED BY ':' +PREHOOK: type: CREATETABLE +PREHOOK: Output: 
database:default +PREHOOK: Output: default@parquet_types_staging +POSTHOOK: query: CREATE TABLE parquet_types_staging ( + cint int, + ctinyint tinyint, + csmallint smallint, + cfloat float, + cdouble double, + cstring1 string, + t timestamp, + cchar char(5), + cvarchar varchar(10), + cbinary string, + m1 map, + l1 array, + st1 struct, + d date +) ROW FORMAT DELIMITED +FIELDS TERMINATED BY '|' +COLLECTION ITEMS TERMINATED BY ',' +MAP KEYS TERMINATED BY ':' +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@parquet_types_staging +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/parquet_types.txt' OVERWRITE INTO TABLE parquet_types_staging +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@parquet_types_staging +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/parquet_types.txt' OVERWRITE INTO TABLE parquet_types_staging +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@parquet_types_staging +PREHOOK: query: DROP TABLE IF EXISTS parquet_project_test +PREHOOK: type: DROPTABLE +POSTHOOK: query: DROP TABLE IF EXISTS parquet_project_test +POSTHOOK: type: DROPTABLE +PREHOOK: query: CREATE TABLE parquet_project_test( +cint int, +m1 map +) STORED AS PARQUET +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@parquet_project_test +POSTHOOK: query: CREATE TABLE parquet_project_test( +cint int, +m1 map +) STORED AS PARQUET +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@parquet_project_test +PREHOOK: query: insert into parquet_project_test +select ctinyint, map("color","red") from parquet_types_staging +where ctinyint = 1 +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_types_staging +PREHOOK: Output: default@parquet_project_test +POSTHOOK: query: insert into parquet_project_test +select ctinyint, map("color","red") from parquet_types_staging +where ctinyint = 1 
+POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_types_staging +POSTHOOK: Output: default@parquet_project_test +POSTHOOK: Lineage: parquet_project_test.cint EXPRESSION [] +POSTHOOK: Lineage: parquet_project_test.m1 EXPRESSION [] +PREHOOK: query: insert into parquet_project_test +select ctinyint, map("color","green") from parquet_types_staging +where ctinyint = 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_types_staging +PREHOOK: Output: default@parquet_project_test +POSTHOOK: query: insert into parquet_project_test +select ctinyint, map("color","green") from parquet_types_staging +where ctinyint = 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_types_staging +POSTHOOK: Output: default@parquet_project_test +POSTHOOK: Lineage: parquet_project_test.cint EXPRESSION [] +POSTHOOK: Lineage: parquet_project_test.m1 EXPRESSION [] +PREHOOK: query: insert into parquet_project_test +select ctinyint, map("color","blue") from parquet_types_staging +where ctinyint = 3 +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_types_staging +PREHOOK: Output: default@parquet_project_test +POSTHOOK: query: insert into parquet_project_test +select ctinyint, map("color","blue") from parquet_types_staging +where ctinyint = 3 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_types_staging +POSTHOOK: Output: default@parquet_project_test +POSTHOOK: Lineage: parquet_project_test.cint EXPRESSION [] +POSTHOOK: Lineage: parquet_project_test.m1 EXPRESSION [] +PREHOOK: query: explain vectorization select * from parquet_project_test +PREHOOK: type: QUERY +POSTHOOK: query: explain vectorization select * from parquet_project_test +POSTHOOK: type: QUERY +PLAN VECTORIZATION: + enabled: true + enabledConditionsMet: [hive.vectorized.execution.enabled IS true] + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: parquet_project_test + Statistics: Num 
rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: cint (type: int), m1 (type: map) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Map Vectorization: + enabled: true + enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true + inputFileFormats: org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat + notVectorizedReason: Select expression for SELECT operator: Data type map of Column[m1] not supported + vectorized: false + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select * from parquet_project_test +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_project_test +#### A masked pattern was here #### +POSTHOOK: query: select * from parquet_project_test +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_project_test +#### A masked pattern was here #### +1 {"color":"red"} +1 {"color":"red"} +1 {"color":"red"} +1 {"color":"red"} +1 {"color":"red"} +1 {"color":"red"} +1 {"color":"red"} +1 {"color":"red"} +2 {"color":"green"} +2 {"color":"green"} +2 {"color":"green"} +2 {"color":"green"} +2 {"color":"green"} +2 {"color":"green"} +2 {"color":"green"} +3 {"color":"blue"} +3 {"color":"blue"} +3 {"color":"blue"} +3 {"color":"blue"} +3 {"color":"blue"} +3 {"color":"blue"} +3 {"color":"blue"} +PREHOOK: query: explain vectorization select count(*) from parquet_project_test +PREHOOK: type: QUERY +POSTHOOK: query: explain vectorization select count(*) from parquet_project_test +POSTHOOK: type: QUERY +PLAN VECTORIZATION: + enabled: true + 
enabledConditionsMet: [hive.vectorized.execution.enabled IS true] + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: parquet_project_test + Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: count() + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: bigint) + Execution mode: vectorized + Map Vectorization: + enabled: true + enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true + groupByVectorOutput: true + inputFileFormats: org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat + allNative: false + usesVectorUDFAdaptor: false + vectorized: true + Reduce Vectorization: + enabled: false + enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true + enableConditionsNotMet: hive.execution.engine mr IN [tez, spark] IS false + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select count(*) from parquet_project_test +PREHOOK: type: QUERY 
+PREHOOK: Input: default@parquet_project_test +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from parquet_project_test +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_project_test +#### A masked pattern was here #### +22 +PREHOOK: query: explain vectorization select cint, count(*) from parquet_project_test +group by cint +PREHOOK: type: QUERY +POSTHOOK: query: explain vectorization select cint, count(*) from parquet_project_test +group by cint +POSTHOOK: type: QUERY +PLAN VECTORIZATION: + enabled: true + enabledConditionsMet: [hive.vectorized.execution.enabled IS true] + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: parquet_project_test + Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: cint (type: int) + outputColumnNames: cint + Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count() + keys: cint (type: int) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: bigint) + Execution mode: vectorized + Map Vectorization: + enabled: true + enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true + groupByVectorOutput: true + inputFileFormats: org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat + allNative: false + usesVectorUDFAdaptor: false + vectorized: true + Reduce Vectorization: + enabled: false + enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true + enableConditionsNotMet: hive.execution.engine mr IN [tez, spark] IS 
false + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: int) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 11 Data size: 22 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 11 Data size: 22 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select cint, count(*) from parquet_project_test +group by cint +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_project_test +#### A masked pattern was here #### +POSTHOOK: query: select cint, count(*) from parquet_project_test +group by cint +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_project_test +#### A masked pattern was here #### +1 8 +2 7 +3 7 +PREHOOK: query: explain vectorization select m1["color"], count(*) from parquet_project_test +group by m1["color"] +PREHOOK: type: QUERY +POSTHOOK: query: explain vectorization select m1["color"], count(*) from parquet_project_test +group by m1["color"] +POSTHOOK: type: QUERY +PLAN VECTORIZATION: + enabled: true + enabledConditionsMet: [hive.vectorized.execution.enabled IS true] + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: parquet_project_test + Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: m1['color'] (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count() + keys: _col0 (type: string) + mode: hash + outputColumnNames: _col0, 
_col1 + Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: bigint) + Map Vectorization: + enabled: true + enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true + inputFileFormats: org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat + notVectorizedReason: Select expression for SELECT operator: Data type map of Column[m1] not supported + vectorized: false + Reduce Vectorization: + enabled: false + enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true + enableConditionsNotMet: hive.execution.engine mr IN [tez, spark] IS false + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 11 Data size: 22 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 11 Data size: 22 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select m1["color"], count(*) from parquet_project_test +group by m1["color"] +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_project_test +#### A masked pattern was here #### +POSTHOOK: query: select m1["color"], count(*) from parquet_project_test +group by m1["color"] +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_project_test +#### A masked pattern was here #### +blue 7 +green 7 +red 8