commit 3d743f6b25efa9489d373a6d3cc066d144c2e297 Author: Vihang Karajgaonkar Date: Sun Oct 22 16:34:35 2017 -0700 HIVE-17874 : Parquet vectorization fails on tables with complex columns when there are no projected columns diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties index 06ebc98d1af91b490af8a21eaffef696880b6f96..52c19b53c5831fbb7cb28a68a1aa951547f9bbab 100644 --- a/itests/src/test/resources/testconfiguration.properties +++ b/itests/src/test/resources/testconfiguration.properties @@ -1408,6 +1408,7 @@ spark.query.files=add_part_multiple.q, \ vectorization_not.q, \ vectorization_part.q, \ vectorization_part_project.q, \ + vectorization_parquet_projection.q, \ vectorization_pushdown.q, \ vectorization_short_regress.q, \ vectorized_case.q, \ diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedParquetRecordReader.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedParquetRecordReader.java index 09777599a7eb53e790c19d6f13909ddf80abd1bf..8715aac7efa5185044f3103419cb230d3dbea922 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedParquetRecordReader.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedParquetRecordReader.java @@ -93,7 +93,6 @@ private List columnNamesList; private List columnTypesList; private VectorizedRowBatchCtx rbCtx; - private List indexColumnsWanted; private Object[] partitionValues; private Path cacheFsPath; @@ -127,8 +126,6 @@ public VectorizedParquetRecordReader( serDeStats = new SerDeStats(); projectionPusher = new ProjectionPusher(); initialize(inputSplit, conf); - colsToInclude = ColumnProjectionUtils.getReadColumnIDs(conf); - rbCtx = Utilities.getVectorizedRowBatchCtx(conf); } catch (Throwable e) { LOG.error("Failed to create the vectorized reader due to exception " + e); throw new RuntimeException(e); @@ -153,8 +150,6 @@ public VectorizedParquetRecordReader( if 
(inputSplit != null) { initialize(inputSplit, conf); } - colsToInclude = ColumnProjectionUtils.getReadColumnIDs(conf); - rbCtx = Utilities.getVectorizedRowBatchCtx(conf); initPartitionValues((FileSplit) oldInputSplit, conf); } catch (Throwable e) { LOG.error("Failed to create the vectorized reader due to exception " + e); @@ -270,10 +265,10 @@ public void initialize( columnTypesList); } - indexColumnsWanted = ColumnProjectionUtils.getReadColumnIDs(configuration); - if (!ColumnProjectionUtils.isReadAllColumns(configuration) && !indexColumnsWanted.isEmpty()) { + colsToInclude = ColumnProjectionUtils.getReadColumnIDs(configuration); + if (!ColumnProjectionUtils.isReadAllColumns(configuration) && !colsToInclude.isEmpty()) { requestedSchema = - DataWritableReadSupport.getSchemaByIndex(tableSchema, columnNamesList, indexColumnsWanted); + DataWritableReadSupport.getSchemaByIndex(tableSchema, columnNamesList, colsToInclude); } else { requestedSchema = fileSchema; } @@ -281,6 +276,8 @@ public void initialize( Path path = wrapPathForCache(file, cacheKey, configuration, blocks); this.reader = new ParquetFileReader( configuration, footer.getFileMetaData(), path, blocks, requestedSchema.getColumns()); + //initialize the rowbatchContext + rbCtx = Utilities.getVectorizedRowBatchCtx(jobConf); } private Path wrapPathForCache(Path path, Object fileKey, JobConf configuration, @@ -453,11 +450,17 @@ private void checkEndOfRowGroup() throws IOException { List types = requestedSchema.getFields(); columnReaders = new VectorizedColumnReader[columns.size()]; - if (!ColumnProjectionUtils.isReadAllColumns(jobConf) && !indexColumnsWanted.isEmpty()) { - for (int i = 0; i < types.size(); ++i) { - columnReaders[i] = - buildVectorizedParquetReader(columnTypesList.get(indexColumnsWanted.get(i)), types.get(i), - pages, requestedSchema.getColumns(), skipTimestampConversion, 0); + if (!ColumnProjectionUtils.isReadAllColumns(jobConf)) { + //certain queries like select count(*) from table do not have + 
//any projected columns and still have isReadAllColumns as false + //in such cases columnReaders are not needed + //However, if colsToInclude is not empty we should initialize each columnReader + if(!colsToInclude.isEmpty()) { + for (int i = 0; i < types.size(); ++i) { + columnReaders[i] = + buildVectorizedParquetReader(columnTypesList.get(colsToInclude.get(i)), types.get(i), + pages, requestedSchema.getColumns(), skipTimestampConversion, 0); + } } } else { for (int i = 0; i < types.size(); ++i) { diff --git a/ql/src/test/queries/clientpositive/vectorization_parquet_projection.q b/ql/src/test/queries/clientpositive/vectorization_parquet_projection.q new file mode 100644 index 0000000000000000000000000000000000000000..8865c797419bc2b85bc301007ddaa2bb3643822c --- /dev/null +++ b/ql/src/test/queries/clientpositive/vectorization_parquet_projection.q @@ -0,0 +1,79 @@ +set hive.fetch.task.conversion=none; +set hive.compute.query.using.stats=false; +set hive.vectorized.use.row.serde.deserialize=false; +set hive.vectorized.use.vector.serde.deserialize=false; +set hive.vectorized.execution.enabled=true; +set hive.vectorized.execution.reduce.enabled=true; +set hive.mapred.mode=nonstrict; +set hive.llap.cache.allow.synthetic.fileid=true; +set hive.vectorized.groupby.complex.types.enabled=false; +set hive.vectorized.complex.types.enabled=false; + +-- SORT_QUERY_RESULTS + +DROP TABLE IF EXISTS parquet_types_staging; + +CREATE TABLE parquet_types_staging ( + cint int, + ctinyint tinyint, + csmallint smallint, + cfloat float, + cdouble double, + cstring1 string, + t timestamp, + cchar char(5), + cvarchar varchar(10), + cbinary string, + m1 map, + l1 array, + st1 struct, + d date +) ROW FORMAT DELIMITED +FIELDS TERMINATED BY '|' +COLLECTION ITEMS TERMINATED BY ',' +MAP KEYS TERMINATED BY ':'; + +LOAD DATA LOCAL INPATH '../../data/files/parquet_types.txt' OVERWRITE INTO TABLE parquet_types_staging; + +-- test various number of projected columns + +DROP TABLE IF EXISTS 
parquet_project_test; + +CREATE TABLE parquet_project_test( +cint int, +m1 map +) STORED AS PARQUET; + +insert into parquet_project_test +select ctinyint, map("color","red") from parquet_types_staging +where ctinyint = 1; + +insert into parquet_project_test +select ctinyint, map("color","green") from parquet_types_staging +where ctinyint = 2; + +insert into parquet_project_test +select ctinyint, map("color","blue") from parquet_types_staging +where ctinyint = 3; + +-- all columns in the projection (select *), not vectorized because of the map column +explain vectorization select * from parquet_project_test; +select * from parquet_project_test; + +-- no columns in the projection, just count(*) +explain vectorization select count(*) from parquet_project_test; +select count(*) from parquet_project_test; + +-- project a primitive type +explain vectorization select cint, count(*) from parquet_project_test +group by cint; + +select cint, count(*) from parquet_project_test +group by cint; + +-- test complex type in projection, this should not get vectorized +explain vectorization select m1["color"], count(*) from parquet_project_test +group by m1["color"]; + +select m1["color"], count(*) from parquet_project_test +group by m1["color"]; diff --git a/ql/src/test/results/clientpositive/spark/vectorization_parquet_projection.q.out b/ql/src/test/results/clientpositive/spark/vectorization_parquet_projection.q.out new file mode 100644 index 0000000000000000000000000000000000000000..22b2859059b5164cdaa67cf5ce467338f9ea45fd --- /dev/null +++ b/ql/src/test/results/clientpositive/spark/vectorization_parquet_projection.q.out @@ -0,0 +1,459 @@ +PREHOOK: query: DROP TABLE IF EXISTS parquet_types_staging +PREHOOK: type: DROPTABLE +POSTHOOK: query: DROP TABLE IF EXISTS parquet_types_staging +POSTHOOK: type: DROPTABLE +PREHOOK: query: CREATE TABLE parquet_types_staging ( + cint int, + ctinyint tinyint, + csmallint smallint, + cfloat float, + cdouble double, + cstring1 string, + t timestamp, + cchar char(5), + cvarchar varchar(10), + cbinary
string, + m1 map, + l1 array, + st1 struct, + d date +) ROW FORMAT DELIMITED +FIELDS TERMINATED BY '|' +COLLECTION ITEMS TERMINATED BY ',' +MAP KEYS TERMINATED BY ':' +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@parquet_types_staging +POSTHOOK: query: CREATE TABLE parquet_types_staging ( + cint int, + ctinyint tinyint, + csmallint smallint, + cfloat float, + cdouble double, + cstring1 string, + t timestamp, + cchar char(5), + cvarchar varchar(10), + cbinary string, + m1 map, + l1 array, + st1 struct, + d date +) ROW FORMAT DELIMITED +FIELDS TERMINATED BY '|' +COLLECTION ITEMS TERMINATED BY ',' +MAP KEYS TERMINATED BY ':' +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@parquet_types_staging +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/parquet_types.txt' OVERWRITE INTO TABLE parquet_types_staging +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@parquet_types_staging +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/parquet_types.txt' OVERWRITE INTO TABLE parquet_types_staging +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@parquet_types_staging +PREHOOK: query: DROP TABLE IF EXISTS parquet_project_test +PREHOOK: type: DROPTABLE +POSTHOOK: query: DROP TABLE IF EXISTS parquet_project_test +POSTHOOK: type: DROPTABLE +PREHOOK: query: CREATE TABLE parquet_project_test( +cint int, +m1 map +) STORED AS PARQUET +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@parquet_project_test +POSTHOOK: query: CREATE TABLE parquet_project_test( +cint int, +m1 map +) STORED AS PARQUET +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@parquet_project_test +PREHOOK: query: insert into parquet_project_test +select ctinyint, map("color","red") from parquet_types_staging +where ctinyint = 1 +PREHOOK: type: QUERY +PREHOOK: Input: 
default@parquet_types_staging +PREHOOK: Output: default@parquet_project_test +POSTHOOK: query: insert into parquet_project_test +select ctinyint, map("color","red") from parquet_types_staging +where ctinyint = 1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_types_staging +POSTHOOK: Output: default@parquet_project_test +POSTHOOK: Lineage: parquet_project_test.cint EXPRESSION [] +POSTHOOK: Lineage: parquet_project_test.m1 EXPRESSION [] +PREHOOK: query: insert into parquet_project_test +select ctinyint, map("color","green") from parquet_types_staging +where ctinyint = 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_types_staging +PREHOOK: Output: default@parquet_project_test +POSTHOOK: query: insert into parquet_project_test +select ctinyint, map("color","green") from parquet_types_staging +where ctinyint = 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_types_staging +POSTHOOK: Output: default@parquet_project_test +POSTHOOK: Lineage: parquet_project_test.cint EXPRESSION [] +POSTHOOK: Lineage: parquet_project_test.m1 EXPRESSION [] +PREHOOK: query: insert into parquet_project_test +select ctinyint, map("color","blue") from parquet_types_staging +where ctinyint = 3 +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_types_staging +PREHOOK: Output: default@parquet_project_test +POSTHOOK: query: insert into parquet_project_test +select ctinyint, map("color","blue") from parquet_types_staging +where ctinyint = 3 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_types_staging +POSTHOOK: Output: default@parquet_project_test +POSTHOOK: Lineage: parquet_project_test.cint EXPRESSION [] +POSTHOOK: Lineage: parquet_project_test.m1 EXPRESSION [] +PREHOOK: query: explain vectorization select * from parquet_project_test +PREHOOK: type: QUERY +POSTHOOK: query: explain vectorization select * from parquet_project_test +POSTHOOK: type: QUERY +PLAN VECTORIZATION: + enabled: true + enabledConditionsMet: [hive.vectorized.execution.enabled IS true] + 
+STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: parquet_project_test + Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: cint (type: int), m1 (type: map) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Map Vectorization: + enabled: true + enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true + inputFileFormats: org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat + notVectorizedReason: Select expression for SELECT operator: Vectorizing complex type MAP not enabled (map) since hive.vectorized.complex.types.enabled IS false + vectorized: false + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select * from parquet_project_test +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_project_test +#### A masked pattern was here #### +POSTHOOK: query: select * from parquet_project_test +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_project_test +#### A masked pattern was here #### +1 {"color":"red"} +1 {"color":"red"} +1 {"color":"red"} +1 {"color":"red"} +1 {"color":"red"} +1 {"color":"red"} +1 {"color":"red"} +1 {"color":"red"} +2 {"color":"green"} +2 {"color":"green"} +2 {"color":"green"} +2 {"color":"green"} +2 {"color":"green"} +2 {"color":"green"} +2 {"color":"green"} +3 {"color":"blue"} +3 {"color":"blue"} +3 {"color":"blue"} +3 {"color":"blue"} +3 
{"color":"blue"} +3 {"color":"blue"} +3 {"color":"blue"} +PREHOOK: query: explain vectorization select count(*) from parquet_project_test +PREHOOK: type: QUERY +POSTHOOK: query: explain vectorization select count(*) from parquet_project_test +POSTHOOK: type: QUERY +PLAN VECTORIZATION: + enabled: true + enabledConditionsMet: [hive.vectorized.execution.enabled IS true] + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Spark + Edges: + Reducer 2 <- Map 1 (GROUP, 1) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: parquet_project_test + Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: count() + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: bigint) + Execution mode: vectorized + Map Vectorization: + enabled: true + enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true + groupByVectorOutput: true + inputFileFormats: org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat + allNative: false + usesVectorUDFAdaptor: false + vectorized: true + Reducer 2 + Execution mode: vectorized + Reduce Vectorization: + enabled: true + enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true, hive.execution.engine spark IN [tez, spark] IS true + groupByVectorOutput: true + allNative: false + usesVectorUDFAdaptor: false + vectorized: true + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column 
stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select count(*) from parquet_project_test +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_project_test +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from parquet_project_test +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_project_test +#### A masked pattern was here #### +22 +PREHOOK: query: explain vectorization select cint, count(*) from parquet_project_test +group by cint +PREHOOK: type: QUERY +POSTHOOK: query: explain vectorization select cint, count(*) from parquet_project_test +group by cint +POSTHOOK: type: QUERY +PLAN VECTORIZATION: + enabled: true + enabledConditionsMet: [hive.vectorized.execution.enabled IS true] + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Spark + Edges: + Reducer 2 <- Map 1 (GROUP, 2) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: parquet_project_test + Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: cint (type: int) + outputColumnNames: cint + Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count() + keys: cint (type: int) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 
22 Data size: 44 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: bigint) + Execution mode: vectorized + Map Vectorization: + enabled: true + enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true + groupByVectorOutput: true + inputFileFormats: org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat + allNative: false + usesVectorUDFAdaptor: false + vectorized: true + Reducer 2 + Execution mode: vectorized + Reduce Vectorization: + enabled: true + enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true, hive.execution.engine spark IN [tez, spark] IS true + groupByVectorOutput: true + allNative: false + usesVectorUDFAdaptor: false + vectorized: true + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: int) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 11 Data size: 22 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 11 Data size: 22 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select cint, count(*) from parquet_project_test +group by cint +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_project_test +#### A masked pattern was here #### +POSTHOOK: query: select cint, count(*) from parquet_project_test +group by cint +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_project_test +#### A masked pattern was here #### +1 8 +2 7 +3 7 +PREHOOK: query: explain vectorization select m1["color"], count(*) from parquet_project_test +group by m1["color"] +PREHOOK: type: QUERY +POSTHOOK: query: explain vectorization select m1["color"], count(*) from 
parquet_project_test +group by m1["color"] +POSTHOOK: type: QUERY +PLAN VECTORIZATION: + enabled: true + enabledConditionsMet: [hive.vectorized.execution.enabled IS true] + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Spark + Edges: + Reducer 2 <- Map 1 (GROUP, 2) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: parquet_project_test + Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: m1['color'] (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count() + keys: _col0 (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: bigint) + Map Vectorization: + enabled: true + enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true + inputFileFormats: org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat + notVectorizedReason: Select expression for SELECT operator: Vectorizing complex type MAP not enabled (map) since hive.vectorized.complex.types.enabled IS false + vectorized: false + Reducer 2 + Execution mode: vectorized + Reduce Vectorization: + enabled: true + enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true, hive.execution.engine spark IN [tez, spark] IS true + groupByVectorOutput: true + allNative: false + usesVectorUDFAdaptor: false + vectorized: true + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: string) + mode: mergepartial + 
outputColumnNames: _col0, _col1 + Statistics: Num rows: 11 Data size: 22 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 11 Data size: 22 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select m1["color"], count(*) from parquet_project_test +group by m1["color"] +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_project_test +#### A masked pattern was here #### +POSTHOOK: query: select m1["color"], count(*) from parquet_project_test +group by m1["color"] +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_project_test +#### A masked pattern was here #### +blue 7 +green 7 +red 8 diff --git a/ql/src/test/results/clientpositive/vectorization_parquet_projection.q.out b/ql/src/test/results/clientpositive/vectorization_parquet_projection.q.out new file mode 100644 index 0000000000000000000000000000000000000000..0423ba605f3e2d9f9a5fb653e445a4974a6df4eb --- /dev/null +++ b/ql/src/test/results/clientpositive/vectorization_parquet_projection.q.out @@ -0,0 +1,426 @@ +PREHOOK: query: DROP TABLE IF EXISTS parquet_types_staging +PREHOOK: type: DROPTABLE +POSTHOOK: query: DROP TABLE IF EXISTS parquet_types_staging +POSTHOOK: type: DROPTABLE +PREHOOK: query: CREATE TABLE parquet_types_staging ( + cint int, + ctinyint tinyint, + csmallint smallint, + cfloat float, + cdouble double, + cstring1 string, + t timestamp, + cchar char(5), + cvarchar varchar(10), + cbinary string, + m1 map, + l1 array, + st1 struct, + d date +) ROW FORMAT DELIMITED +FIELDS TERMINATED BY '|' +COLLECTION ITEMS TERMINATED BY ',' +MAP KEYS TERMINATED BY ':' +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: 
default@parquet_types_staging +POSTHOOK: query: CREATE TABLE parquet_types_staging ( + cint int, + ctinyint tinyint, + csmallint smallint, + cfloat float, + cdouble double, + cstring1 string, + t timestamp, + cchar char(5), + cvarchar varchar(10), + cbinary string, + m1 map, + l1 array, + st1 struct, + d date +) ROW FORMAT DELIMITED +FIELDS TERMINATED BY '|' +COLLECTION ITEMS TERMINATED BY ',' +MAP KEYS TERMINATED BY ':' +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@parquet_types_staging +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/parquet_types.txt' OVERWRITE INTO TABLE parquet_types_staging +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@parquet_types_staging +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/parquet_types.txt' OVERWRITE INTO TABLE parquet_types_staging +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@parquet_types_staging +PREHOOK: query: DROP TABLE IF EXISTS parquet_project_test +PREHOOK: type: DROPTABLE +POSTHOOK: query: DROP TABLE IF EXISTS parquet_project_test +POSTHOOK: type: DROPTABLE +PREHOOK: query: CREATE TABLE parquet_project_test( +cint int, +m1 map +) STORED AS PARQUET +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@parquet_project_test +POSTHOOK: query: CREATE TABLE parquet_project_test( +cint int, +m1 map +) STORED AS PARQUET +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@parquet_project_test +PREHOOK: query: insert into parquet_project_test +select ctinyint, map("color","red") from parquet_types_staging +where ctinyint = 1 +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_types_staging +PREHOOK: Output: default@parquet_project_test +POSTHOOK: query: insert into parquet_project_test +select ctinyint, map("color","red") from parquet_types_staging +where ctinyint = 1 +POSTHOOK: type: QUERY +POSTHOOK: Input: 
default@parquet_types_staging +POSTHOOK: Output: default@parquet_project_test +POSTHOOK: Lineage: parquet_project_test.cint EXPRESSION [] +POSTHOOK: Lineage: parquet_project_test.m1 EXPRESSION [] +PREHOOK: query: insert into parquet_project_test +select ctinyint, map("color","green") from parquet_types_staging +where ctinyint = 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_types_staging +PREHOOK: Output: default@parquet_project_test +POSTHOOK: query: insert into parquet_project_test +select ctinyint, map("color","green") from parquet_types_staging +where ctinyint = 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_types_staging +POSTHOOK: Output: default@parquet_project_test +POSTHOOK: Lineage: parquet_project_test.cint EXPRESSION [] +POSTHOOK: Lineage: parquet_project_test.m1 EXPRESSION [] +PREHOOK: query: insert into parquet_project_test +select ctinyint, map("color","blue") from parquet_types_staging +where ctinyint = 3 +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_types_staging +PREHOOK: Output: default@parquet_project_test +POSTHOOK: query: insert into parquet_project_test +select ctinyint, map("color","blue") from parquet_types_staging +where ctinyint = 3 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_types_staging +POSTHOOK: Output: default@parquet_project_test +POSTHOOK: Lineage: parquet_project_test.cint EXPRESSION [] +POSTHOOK: Lineage: parquet_project_test.m1 EXPRESSION [] +PREHOOK: query: explain vectorization select * from parquet_project_test +PREHOOK: type: QUERY +POSTHOOK: query: explain vectorization select * from parquet_project_test +POSTHOOK: type: QUERY +PLAN VECTORIZATION: + enabled: true + enabledConditionsMet: [hive.vectorized.execution.enabled IS true] + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: parquet_project_test + Statistics: Num rows: 22 Data size: 44 Basic stats: 
COMPLETE Column stats: NONE + Select Operator + expressions: cint (type: int), m1 (type: map) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Map Vectorization: + enabled: true + enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true + inputFileFormats: org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat + notVectorizedReason: Select expression for SELECT operator: Vectorizing complex type MAP not enabled (map) since hive.vectorized.complex.types.enabled IS false + vectorized: false + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select * from parquet_project_test +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_project_test +#### A masked pattern was here #### +POSTHOOK: query: select * from parquet_project_test +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_project_test +#### A masked pattern was here #### +1 {"color":"red"} +1 {"color":"red"} +1 {"color":"red"} +1 {"color":"red"} +1 {"color":"red"} +1 {"color":"red"} +1 {"color":"red"} +1 {"color":"red"} +2 {"color":"green"} +2 {"color":"green"} +2 {"color":"green"} +2 {"color":"green"} +2 {"color":"green"} +2 {"color":"green"} +2 {"color":"green"} +3 {"color":"blue"} +3 {"color":"blue"} +3 {"color":"blue"} +3 {"color":"blue"} +3 {"color":"blue"} +3 {"color":"blue"} +3 {"color":"blue"} +PREHOOK: query: explain vectorization select count(*) from parquet_project_test +PREHOOK: type: QUERY +POSTHOOK: query: explain vectorization select count(*) from parquet_project_test +POSTHOOK: type: QUERY +PLAN VECTORIZATION: + enabled: 
true + enabledConditionsMet: [hive.vectorized.execution.enabled IS true] + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: parquet_project_test + Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: count() + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: bigint) + Execution mode: vectorized + Map Vectorization: + enabled: true + enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true + groupByVectorOutput: true + inputFileFormats: org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat + allNative: false + usesVectorUDFAdaptor: false + vectorized: true + Reduce Vectorization: + enabled: false + enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true + enableConditionsNotMet: hive.execution.engine mr IN [tez, spark] IS false + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select count(*) from parquet_project_test +PREHOOK: 
type: QUERY +PREHOOK: Input: default@parquet_project_test +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from parquet_project_test +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_project_test +#### A masked pattern was here #### +22 +PREHOOK: query: explain vectorization select cint, count(*) from parquet_project_test +group by cint +PREHOOK: type: QUERY +POSTHOOK: query: explain vectorization select cint, count(*) from parquet_project_test +group by cint +POSTHOOK: type: QUERY +PLAN VECTORIZATION: + enabled: true + enabledConditionsMet: [hive.vectorized.execution.enabled IS true] + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: parquet_project_test + Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: cint (type: int) + outputColumnNames: cint + Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count() + keys: cint (type: int) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: bigint) + Execution mode: vectorized + Map Vectorization: + enabled: true + enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true + groupByVectorOutput: true + inputFileFormats: org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat + allNative: false + usesVectorUDFAdaptor: false + vectorized: true + Reduce Vectorization: + enabled: false + enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true + enableConditionsNotMet: hive.execution.engine mr IN [tez, 
spark] IS false + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: int) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 11 Data size: 22 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 11 Data size: 22 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select cint, count(*) from parquet_project_test +group by cint +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_project_test +#### A masked pattern was here #### +POSTHOOK: query: select cint, count(*) from parquet_project_test +group by cint +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_project_test +#### A masked pattern was here #### +1 8 +2 7 +3 7 +PREHOOK: query: explain vectorization select m1["color"], count(*) from parquet_project_test +group by m1["color"] +PREHOOK: type: QUERY +POSTHOOK: query: explain vectorization select m1["color"], count(*) from parquet_project_test +group by m1["color"] +POSTHOOK: type: QUERY +PLAN VECTORIZATION: + enabled: true + enabledConditionsMet: [hive.vectorized.execution.enabled IS true] + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: parquet_project_test + Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: m1['color'] (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count() + keys: _col0 (type: string) + mode: hash + outputColumnNames: 
_col0, _col1 + Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 22 Data size: 44 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: bigint) + Map Vectorization: + enabled: true + enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true + inputFileFormats: org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat + notVectorizedReason: Select expression for SELECT operator: Vectorizing complex type MAP not enabled (map) since hive.vectorized.complex.types.enabled IS false + vectorized: false + Reduce Vectorization: + enabled: false + enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true + enableConditionsNotMet: hive.execution.engine mr IN [tez, spark] IS false + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 11 Data size: 22 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 11 Data size: 22 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select m1["color"], count(*) from parquet_project_test +group by m1["color"] +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_project_test +#### A masked pattern was here #### +POSTHOOK: query: select m1["color"], count(*) from parquet_project_test +group by m1["color"] +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_project_test +#### A masked pattern was here #### +blue 7 +green 7 
+red 8