commit 10d0625df9569261a2edec01cd3146c6577bb724 Author: Vihang Karajgaonkar Date: Wed Jan 10 14:18:12 2018 -0800 HIVE-18422 : Vectorized input format should not be used when input format is excluded and row.serde is enabled diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java index 6f289703a70b2fc2c9cf6fc38707ae0fc4a68809..4e001679b306d10edb0bb7859ce9a3842febc889 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java @@ -524,7 +524,9 @@ public Vectorizer() { int partitionColumnCount; List availableVirtualColumnList; List neededVirtualColumnList; - boolean useVectorizedInputFileFormat; + //not to be confused with useVectorizedInputFileFormat at Vectorizer level + //which represents the value of configuration hive.vectorized.use.vectorized.input.format + private boolean useVectorizedInputFileFormat; Set inputFormatSupportSet; Set supportSetInUse; @@ -1299,6 +1301,20 @@ private boolean verifyAndSetVectorPartDesc( return false; } + private boolean shouldUseVectorizedInputFormat(Set inputFileFormatClassNames) { + if (inputFileFormatClassNames == null || inputFileFormatClassNames.isEmpty() + || !useVectorizedInputFileFormat) { + return useVectorizedInputFileFormat; + } + //Global config of vectorized input format is enabled; check if these inputformats are excluded + for (String inputFormat : inputFileFormatClassNames) { + if(isInputFormatExcluded(inputFormat, vectorizedInputFormatExcludes)) { + return false; + } + } + return true; + } + private boolean isInputFormatExcluded(String inputFileFormatClassName, Collection> excludes) { Class ifClass = null; try { @@ -1494,7 +1510,8 @@ private boolean isInputFormatExcluded(String inputFileFormatClassName, Collectio vectorTaskColumnInfo.setDataColumnNums(dataColumnNums); vectorTaskColumnInfo.setPartitionColumnCount(partitionColumnCount); vectorTaskColumnInfo.setAvailableVirtualColumnList(availableVirtualColumnList); - vectorTaskColumnInfo.setUseVectorizedInputFileFormat(useVectorizedInputFileFormat); + vectorTaskColumnInfo.setUseVectorizedInputFileFormat( + shouldUseVectorizedInputFormat(inputFileFormatClassNameSet)); vectorTaskColumnInfo.setInputFormatSupportSet(inputFormatSupportSet); diff --git a/ql/src/test/queries/clientpositive/vectorization_input_format_excludes.q b/ql/src/test/queries/clientpositive/vectorization_input_format_excludes.q index f8e82455e059a4ed25b1e1d2dce7f2fe0a6bffa3..8d22efd9aad160b891b44314871fd523095ce652 100644 --- a/ql/src/test/queries/clientpositive/vectorization_input_format_excludes.q +++ b/ql/src/test/queries/clientpositive/vectorization_input_format_excludes.q @@ -167,3 +167,27 @@ select ctinyint, stddev_pop(cdouble) from alltypes_orc group by ctinyint; + +-- test when input format is excluded row serde is used for vectorization +set hive.vectorized.input.format.excludes=org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat,org.apache.hadoop.hive.ql.io.orc.OrcInputFormat; +set hive.vectorized.use.vectorized.input.format=true; +set hive.vectorized.use.row.serde.deserialize=true; +set hive.vectorized.row.serde.inputformat.excludes=; + +create table orcTbl (t1 tinyint, t2 tinyint) +stored as orc; + +insert into orcTbl values (54, 9), (-104, 25), (-112, 24); + +explain vectorization select t1, t2, (t1+t2) from orcTbl where (t1+t2) > 10; + +select t1, t2, (t1+t2) from orcTbl where (t1+t2) > 10; + +create table parquetTbl (t1 tinyint, t2 tinyint) +stored as parquet; + +insert into parquetTbl values (54, 9), (-104, 25), (-112, 24); + +explain vectorization SELECT t1, t2, (t1 + t2) FROM parquetTbl WHERE (t1 + t2) > 10; + +SELECT t1, t2, (t1 + t2) FROM parquetTbl WHERE (t1 + t2) > 10; diff --git a/ql/src/test/results/clientpositive/llap/vectorization_input_format_excludes.q.out b/ql/src/test/results/clientpositive/llap/vectorization_input_format_excludes.q.out index ab8752a8d22fb6288396896368124ff7c6561762..8c22db1e671d74b2a342bb85ca600028fedc28b5 100644 --- a/ql/src/test/results/clientpositive/llap/vectorization_input_format_excludes.q.out +++ b/ql/src/test/results/clientpositive/llap/vectorization_input_format_excludes.q.out @@ -1402,3 +1402,165 @@ POSTHOOK: Input: default@alltypes_orc 8 1070764888 -15778 1034 8.0 9562.355155774725 9 626923679 -13629 25 9.0 10157.217948808622 NULL 1073418988 -16379 3115 NULL 305051.4870777435 +PREHOOK: query: create table orcTbl (t1 tinyint, t2 tinyint) +stored as orc +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@orcTbl +POSTHOOK: query: create table orcTbl (t1 tinyint, t2 tinyint) +stored as orc +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@orcTbl +PREHOOK: query: insert into orcTbl values (54, 9), (-104, 25), (-112, 24) +PREHOOK: type: QUERY +PREHOOK: Output: default@orctbl +POSTHOOK: query: insert into orcTbl values (54, 9), (-104, 25), (-112, 24) +POSTHOOK: type: QUERY +POSTHOOK: Output: default@orctbl +POSTHOOK: Lineage: orctbl.t1 EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: orctbl.t2 EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +PREHOOK: query: explain vectorization select t1, t2, (t1+t2) from orcTbl where (t1+t2) > 10 +PREHOOK: type: QUERY +POSTHOOK: query: explain vectorization select t1, t2, (t1+t2) from orcTbl where (t1+t2) > 10 +POSTHOOK: type: QUERY +PLAN VECTORIZATION: + enabled: true + enabledConditionsMet: [hive.vectorized.execution.enabled IS true] + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: orctbl + Statistics: Num rows: 3 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: (UDFToInteger((t1 + t2)) > 10) (type: boolean) + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: t1 (type: tinyint), t2 (type: tinyint), (t1 + t2) (type: tinyint) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Execution mode: vectorized, llap + LLAP IO: all inputs + Map Vectorization: + enabled: true + enabledConditionsMet: hive.vectorized.use.row.serde.deserialize IS true + inputFormatFeatureSupport: [] + featureSupportInUse: [] + inputFileFormats: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + allNative: false + usesVectorUDFAdaptor: false + vectorized: true + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select t1, t2, (t1+t2) from orcTbl where (t1+t2) > 10 +PREHOOK: type: QUERY +PREHOOK: Input: default@orctbl +#### A masked pattern was here #### +POSTHOOK: query: select t1, t2, (t1+t2) from orcTbl where (t1+t2) > 10 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@orctbl +#### A masked pattern was here #### +54 9 63 +PREHOOK: query: create table parquetTbl (t1 tinyint, t2 tinyint) +stored as parquet +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@parquetTbl +POSTHOOK: query: create table parquetTbl (t1 tinyint, t2 tinyint) +stored as parquet +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@parquetTbl +PREHOOK: query: insert into parquetTbl values (54, 9), (-104, 25), (-112, 24) +PREHOOK: type: QUERY +PREHOOK: Output: default@parquettbl +POSTHOOK: query: insert into parquetTbl values (54, 9), (-104, 25), (-112, 24) +POSTHOOK: type: QUERY +POSTHOOK: Output: default@parquettbl +POSTHOOK: Lineage: parquettbl.t1 EXPRESSION [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: parquettbl.t2 EXPRESSION [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +PREHOOK: query: explain vectorization SELECT t1, t2, (t1 + t2) FROM parquetTbl WHERE (t1 + t2) > 10 +PREHOOK: type: QUERY +POSTHOOK: query: explain vectorization SELECT t1, t2, (t1 + t2) FROM parquetTbl WHERE (t1 + t2) > 10 +POSTHOOK: type: QUERY +PLAN VECTORIZATION: + enabled: true + enabledConditionsMet: [hive.vectorized.execution.enabled IS true] + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: parquettbl + Statistics: Num rows: 3 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: (UDFToInteger((t1 + t2)) > 10) (type: boolean) + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: t1 (type: tinyint), t2 (type: tinyint), (t1 + t2) (type: tinyint) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Execution mode: vectorized, llap + LLAP IO: all inputs (cache only) + Map Vectorization: + enabled: true + enabledConditionsMet: hive.vectorized.use.row.serde.deserialize IS true + inputFormatFeatureSupport: [] + featureSupportInUse: [] + inputFileFormats: org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat + allNative: false + usesVectorUDFAdaptor: false + vectorized: true + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: SELECT t1, t2, (t1 + t2) FROM parquetTbl WHERE (t1 + t2) > 10 +PREHOOK: type: QUERY +PREHOOK: Input: default@parquettbl +#### A masked pattern was here #### +POSTHOOK: query: SELECT t1, t2, (t1 + t2) FROM parquetTbl WHERE (t1 + t2) > 10 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquettbl +#### A masked pattern was here #### +54 9 63 diff --git a/ql/src/test/results/clientpositive/spark/vectorization_input_format_excludes.q.out b/ql/src/test/results/clientpositive/spark/vectorization_input_format_excludes.q.out index 887f44bb51665547530430a3ff2f3469ad68bebc..9a6b04e6a91762a46ff1912041527b37c1e98710 100644 --- a/ql/src/test/results/clientpositive/spark/vectorization_input_format_excludes.q.out +++ b/ql/src/test/results/clientpositive/spark/vectorization_input_format_excludes.q.out @@ -1386,3 +1386,163 @@ POSTHOOK: Input: default@alltypes_orc 8 1070764888 -15778 1034 8.0 9562.355155774725 9 626923679 -13629 25 9.0 10157.217948808622 NULL 1073418988 -16379 3115 NULL 305051.4870777435 +PREHOOK: query: create table orcTbl (t1 tinyint, t2 tinyint) +stored as orc +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@orcTbl +POSTHOOK: query: create table orcTbl (t1 tinyint, t2 tinyint) +stored as orc +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@orcTbl +PREHOOK: query: insert into orcTbl values (54, 9), (-104, 25), (-112, 24) +PREHOOK: type: QUERY +PREHOOK: Output: default@orctbl +POSTHOOK: query: insert into orcTbl values (54, 9), (-104, 25), (-112, 24) +POSTHOOK: type: QUERY +POSTHOOK: Output: default@orctbl +POSTHOOK: Lineage: orctbl.t1 EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: orctbl.t2 EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +PREHOOK: query: explain vectorization select t1, t2, (t1+t2) from orcTbl where (t1+t2) > 10 +PREHOOK: type: QUERY +POSTHOOK: query: explain vectorization select t1, t2, (t1+t2) from orcTbl where (t1+t2) > 10 +POSTHOOK: type: QUERY +PLAN VECTORIZATION: + enabled: true + enabledConditionsMet: [hive.vectorized.execution.enabled IS true] + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: orctbl + Statistics: Num rows: 3 Data size: 24 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (UDFToInteger((t1 + t2)) > 10) (type: boolean) + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: t1 (type: tinyint), t2 (type: tinyint), (t1 + t2) (type: tinyint) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Execution mode: vectorized + Map Vectorization: + enabled: true + enabledConditionsMet: hive.vectorized.use.row.serde.deserialize IS true + inputFormatFeatureSupport: [] + featureSupportInUse: [] + inputFileFormats: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + allNative: false + usesVectorUDFAdaptor: false + vectorized: true + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select t1, t2, (t1+t2) from orcTbl where (t1+t2) > 10 +PREHOOK: type: QUERY +PREHOOK: Input: default@orctbl +#### A masked pattern was here #### +POSTHOOK: query: select t1, t2, (t1+t2) from orcTbl where (t1+t2) > 10 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@orctbl +#### A masked pattern was here #### +54 9 63 +PREHOOK: query: create table parquetTbl (t1 tinyint, t2 tinyint) +stored as parquet +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@parquetTbl +POSTHOOK: query: create table parquetTbl (t1 tinyint, t2 tinyint) +stored as parquet +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@parquetTbl +PREHOOK: query: insert into parquetTbl values (54, 9), (-104, 25), (-112, 24) +PREHOOK: type: QUERY +PREHOOK: Output: default@parquettbl +POSTHOOK: query: insert into parquetTbl values (54, 9), (-104, 25), (-112, 24) +POSTHOOK: type: QUERY +POSTHOOK: Output: default@parquettbl +POSTHOOK: Lineage: parquettbl.t1 EXPRESSION [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: parquettbl.t2 EXPRESSION [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +PREHOOK: query: explain vectorization SELECT t1, t2, (t1 + t2) FROM parquetTbl WHERE (t1 + t2) > 10 +PREHOOK: type: QUERY +POSTHOOK: query: explain vectorization SELECT t1, t2, (t1 + t2) FROM parquetTbl WHERE (t1 + t2) > 10 +POSTHOOK: type: QUERY +PLAN VECTORIZATION: + enabled: true + enabledConditionsMet: [hive.vectorized.execution.enabled IS true] + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: parquettbl + Statistics: Num rows: 3 Data size: 6 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (UDFToInteger((t1 + t2)) > 10) (type: boolean) + Statistics: Num rows: 1 Data size: 2 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: t1 (type: tinyint), t2 (type: tinyint), (t1 + t2) (type: tinyint) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 2 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 2 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Execution mode: vectorized + Map Vectorization: + enabled: true + enabledConditionsMet: hive.vectorized.use.row.serde.deserialize IS true + inputFormatFeatureSupport: [] + featureSupportInUse: [] + inputFileFormats: org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat + allNative: false + usesVectorUDFAdaptor: false + vectorized: true + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: SELECT t1, t2, (t1 + t2) FROM parquetTbl WHERE (t1 + t2) > 10 +PREHOOK: type: QUERY +PREHOOK: Input: default@parquettbl +#### A masked pattern was here #### +POSTHOOK: query: SELECT t1, t2, (t1 + t2) FROM parquetTbl WHERE (t1 + t2) > 10 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquettbl +#### A masked pattern was here #### +54 9 63