diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
index 27b53b8..52ef2d3 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
@@ -101,6 +101,8 @@
 import org.apache.hadoop.hive.ql.exec.vector.expressions.IdentityExpression;
 import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression;
 import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.VectorAggregateExpression;
+import org.apache.hadoop.hive.ql.io.NullRowsInputFormat;
+import org.apache.hadoop.hive.ql.io.OneNullRowInputFormat;
 import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat;
 import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx;
 import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
@@ -353,6 +355,19 @@
     vectorDeserializeTextSupportSet.addAll(Arrays.asList(Support.values()));
   }
 
+  /**
+   * Class names of the input formats an ACID table scan may present to the vectorizer.
+   * Membership is asserted with Preconditions.checkState in verifyAndSetVectorPartDesc.
+   */
+  private static final Set<String> supportedAcidInputFormats = new TreeSet<String>();
+  static {
+    // Today, ACID tables are only ORC.
+    supportedAcidInputFormats.add(OrcInputFormat.class.getName());
+    // For metadataonly or empty rows optimizations, null/onerow input format can be selected.
+    supportedAcidInputFormats.add(NullRowsInputFormat.class.getName());
+    supportedAcidInputFormats.add(OneNullRowInputFormat.class.getName());
+  }
+
   private BaseWork currentBaseWork;
   private Operator<? extends OperatorDesc> currentOperator;
   private Collection<Class<?>> vectorizedInputFormatExcludes;
@@ -1201,7 +1216,7 @@ private boolean verifyAndSetVectorPartDesc(
       // Today, ACID tables are only ORC and that format is vectorizable. Verify these
       // assumptions.
Preconditions.checkState(isInputFileFormatVectorized); - Preconditions.checkState(inputFileFormatClassName.equals(OrcInputFormat.class.getName())); + Preconditions.checkState(supportedAcidInputFormats.contains(inputFileFormatClassName)); if (!useVectorizedInputFileFormat) { enabledConditionsNotMetList.add("Vectorizing ACID tables requires " diff --git a/ql/src/test/queries/clientpositive/acid_nullscan.q b/ql/src/test/queries/clientpositive/acid_nullscan.q new file mode 100644 index 0000000..d048231 --- /dev/null +++ b/ql/src/test/queries/clientpositive/acid_nullscan.q @@ -0,0 +1,17 @@ + +set hive.mapred.mode=nonstrict; +set hive.support.concurrency=true; +set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager; + +set hive.exec.dynamic.partition.mode=nonstrict; +set hive.vectorized.execution.enabled=true; + +CREATE TABLE acid_vectorized(a INT, b STRING) CLUSTERED BY(a) INTO 2 BUCKETS STORED AS ORC TBLPROPERTIES ('transactional'='true'); +insert into table acid_vectorized select cint, cstring1 from alltypesorc where cint is not null order by cint limit 10; +insert into table acid_vectorized values (1, 'bar'); + +explain extended +select sum(a) from acid_vectorized where false; + +select sum(a) from acid_vectorized where false; + diff --git a/ql/src/test/results/clientpositive/acid_nullscan.q.out b/ql/src/test/results/clientpositive/acid_nullscan.q.out new file mode 100644 index 0000000..7fcc831 --- /dev/null +++ b/ql/src/test/results/clientpositive/acid_nullscan.q.out @@ -0,0 +1,162 @@ +PREHOOK: query: CREATE TABLE acid_vectorized(a INT, b STRING) CLUSTERED BY(a) INTO 2 BUCKETS STORED AS ORC TBLPROPERTIES ('transactional'='true') +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@acid_vectorized +POSTHOOK: query: CREATE TABLE acid_vectorized(a INT, b STRING) CLUSTERED BY(a) INTO 2 BUCKETS STORED AS ORC TBLPROPERTIES ('transactional'='true') +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: 
Output: default@acid_vectorized +PREHOOK: query: insert into table acid_vectorized select cint, cstring1 from alltypesorc where cint is not null order by cint limit 10 +PREHOOK: type: QUERY +PREHOOK: Input: default@alltypesorc +PREHOOK: Output: default@acid_vectorized +POSTHOOK: query: insert into table acid_vectorized select cint, cstring1 from alltypesorc where cint is not null order by cint limit 10 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@alltypesorc +POSTHOOK: Output: default@acid_vectorized +POSTHOOK: Lineage: acid_vectorized.a SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cint, type:int, comment:null), ] +POSTHOOK: Lineage: acid_vectorized.b SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cstring1, type:string, comment:null), ] +PREHOOK: query: insert into table acid_vectorized values (1, 'bar') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@acid_vectorized +POSTHOOK: query: insert into table acid_vectorized values (1, 'bar') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@acid_vectorized +POSTHOOK: Lineage: acid_vectorized.a SCRIPT [] +POSTHOOK: Lineage: acid_vectorized.b SCRIPT [] +PREHOOK: query: explain extended +select sum(a) from acid_vectorized where false +PREHOOK: type: QUERY +POSTHOOK: query: explain extended +select sum(a) from acid_vectorized where false +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: acid_vectorized + Statistics: Num rows: 1 Data size: 24510 Basic stats: COMPLETE Column stats: NONE + GatherStats: false + Filter Operator + isSamplingPred: false + predicate: false (type: boolean) + Statistics: Num rows: 1 Data size: 24510 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: sum(a) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data 
size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + null sort order: + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + tag: -1 + value expressions: _col0 (type: bigint) + auto parallelism: false + Execution mode: vectorized + Path -> Alias: + nullscan://null/default.acid_vectorized/part_ [acid_vectorized] + Path -> Partition: + nullscan://null/default.acid_vectorized/part_ + Partition + input format: org.apache.hadoop.hive.ql.io.OneNullRowInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count 2 + bucket_field_name a + column.name.delimiter , + columns a,b + columns.comments + columns.types int:string +#### A masked pattern was here #### + name default.acid_vectorized + numFiles 3 + numRows 0 + rawDataSize 0 + serialization.ddl struct acid_vectorized { i32 a, string b} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.NullStructSerDe + totalSize 2451 + transactional true + transactional_properties default +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.NullStructSerDe + + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + properties: + bucket_count 2 + bucket_field_name a + column.name.delimiter , + columns a,b + columns.comments + columns.types int:string +#### A masked pattern was here #### + name default.acid_vectorized + numFiles 3 + numRows 0 + rawDataSize 0 + serialization.ddl struct acid_vectorized { i32 a, string b} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde + totalSize 2451 + transactional true + transactional_properties default +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + name: default.acid_vectorized + name: default.acid_vectorized + Truncated Path -> Alias: + nullscan://null/default.acid_vectorized/part_ 
[acid_vectorized] + Needs Tagging: false + Reduce Operator Tree: + Group By Operator + aggregations: sum(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0 + columns.types bigint + escape.delim \ + hive.serialization.extend.additional.nesting.levels true + serialization.escape.crlf true + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select sum(a) from acid_vectorized where false +PREHOOK: type: QUERY +PREHOOK: Input: default@acid_vectorized +#### A masked pattern was here #### +POSTHOOK: query: select sum(a) from acid_vectorized where false +POSTHOOK: type: QUERY +POSTHOOK: Input: default@acid_vectorized +#### A masked pattern was here #### +NULL