diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java index c76026b..4918825 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java @@ -39,6 +39,7 @@ import org.apache.hadoop.hive.ql.plan.OperatorDesc; import org.apache.hadoop.hive.ql.plan.TableDesc; import org.apache.hadoop.hive.ql.plan.TableScanDesc; +import org.apache.hadoop.hive.ql.plan.VectorTableScanDesc; import org.apache.hadoop.hive.ql.plan.api.OperatorType; import org.apache.hadoop.hive.ql.stats.StatsCollectionContext; import org.apache.hadoop.hive.ql.stats.StatsPublisher; @@ -115,7 +116,7 @@ public String getSchemaEvolutionColumnsTypes() { @Override public void process(Object row, int tag) throws HiveException { if (rowLimit >= 0) { - if (row instanceof VectorizedRowBatch) { + if (vectorized) { // We need to check with 'instanceof' instead of just checking // vectorized because the row can be a VectorizedRowBatch when // FetchOptimizer kicks in even if the operator pipeline is not @@ -134,7 +135,7 @@ public void process(Object row, int tag) throws HiveException { return; } } - if (conf != null && conf.isGatherStats()) { + if (!vectorized && conf != null && conf.isGatherStats()) { gatherStats(row); } forward(row, inputObjInspectors[tag], vectorized); @@ -254,6 +255,9 @@ protected void initializeOp(Configuration hconf) throws HiveException { } rowLimit = conf.getRowLimit(); + if (!conf.isGatherStats()) { + return; + } if (hconf instanceof JobConf) { jc = (JobConf) hconf; diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java index 6500682..8ce2c33 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java @@ -1513,7 +1513,7 @@ private boolean validateAndVectorizeMapWork(MapWork mapWork, VectorTaskColumnInf LOG.info("Examining input format to see if vectorization is enabled."); ImmutablePair onlyOneTableScanPair = verifyOnlyOneTableScanOperator(mapWork); - if (onlyOneTableScanPair == null) { + if (onlyOneTableScanPair == null) { VectorizerReason notVectorizedReason = currentBaseWork.getNotVectorizedReason(); Preconditions.checkState(notVectorizedReason != null); mapWork.setVectorizationEnabledConditionsNotMet(Arrays.asList(new String[] {notVectorizedReason.toString()})); @@ -1638,6 +1638,11 @@ private boolean validateAndVectorizeMapOperators(MapWork mapWork, TableScanOpera // Set "global" member indicating where to store "not vectorized" information if necessary. currentBaseWork = mapWork; + if (!validateTableScanOperator(tableScanOperator, mapWork)) { + + // The "not vectorized" information has been stored in the MapWork vertex. + return false; + } try { validateAndVectorizeMapOperators(tableScanOperator, isTezOrSpark, vectorTaskColumnInfo); } catch (VectorizerCannotVectorizeException e) { diff --git ql/src/test/queries/clientpositive/vector_gather_stats.q ql/src/test/queries/clientpositive/vector_gather_stats.q new file mode 100644 index 0000000..deaca0e --- /dev/null +++ ql/src/test/queries/clientpositive/vector_gather_stats.q @@ -0,0 +1,26 @@ +set hive.vectorized.execution.enabled=true; + +-- HIVE-18191 + +create table cd +( + cd_demo_sk int, + cd_gender string, + cd_marital_status string, + cd_purchase_estimate int, + cd_credit_rating string, + cd_dep_count int, + cd_dep_employed_count int, + cd_dep_college_count int +) +partitioned by +( + cd_education_status string +); +alter table cd add partition (cd_education_status='Primary'); +insert into table cd partition (cd_education_status='Primary') values (1, 'M', 'M', 500, 'Good', 0, 0, 0); + +explain vectorization detail +analyze table cd partition (cd_education_status) compute statistics; + +analyze table cd partition (cd_education_status) compute statistics; \ No newline at end of file diff --git ql/src/test/results/clientpositive/vector_gather_stats.q.out ql/src/test/results/clientpositive/vector_gather_stats.q.out new file mode 100644 index 0000000..cf9bc85 --- /dev/null +++ ql/src/test/results/clientpositive/vector_gather_stats.q.out @@ -0,0 +1,101 @@ +PREHOOK: query: create table cd +( + cd_demo_sk int, + cd_gender string, + cd_marital_status string, + cd_purchase_estimate int, + cd_credit_rating string, + cd_dep_count int, + cd_dep_employed_count int, + cd_dep_college_count int +) +partitioned by +( + cd_education_status string +) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@cd +POSTHOOK: query: create table cd +( + cd_demo_sk int, + cd_gender string, + cd_marital_status string, + cd_purchase_estimate int, + cd_credit_rating string, + cd_dep_count int, + cd_dep_employed_count int, + cd_dep_college_count int +) +partitioned by +( + cd_education_status string +) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@cd +PREHOOK: query: alter table cd add partition (cd_education_status='Primary') +PREHOOK: type: ALTERTABLE_ADDPARTS +PREHOOK: Output: default@cd +POSTHOOK: query: alter table cd add partition (cd_education_status='Primary') +POSTHOOK: type: ALTERTABLE_ADDPARTS +POSTHOOK: Output: default@cd +POSTHOOK: Output: default@cd@cd_education_status=Primary +PREHOOK: query: insert into table cd partition (cd_education_status='Primary') values (1, 'M', 'M', 500, 'Good', 0, 0, 0) +PREHOOK: type: QUERY +PREHOOK: Output: default@cd@cd_education_status=Primary +POSTHOOK: query: insert into table cd partition (cd_education_status='Primary') values (1, 'M', 'M', 500, 'Good', 0, 0, 0) +POSTHOOK: type: QUERY +POSTHOOK: Output: default@cd@cd_education_status=Primary +POSTHOOK: Lineage: cd PARTITION(cd_education_status=Primary).cd_credit_rating SIMPLE [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col5, type:string, comment:), ] +POSTHOOK: Lineage: cd PARTITION(cd_education_status=Primary).cd_demo_sk EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: cd PARTITION(cd_education_status=Primary).cd_dep_college_count EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col8, type:string, comment:), ] +POSTHOOK: Lineage: cd PARTITION(cd_education_status=Primary).cd_dep_count EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col6, type:string, comment:), ] +POSTHOOK: Lineage: cd PARTITION(cd_education_status=Primary).cd_dep_employed_count EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col7, type:string, comment:), ] +POSTHOOK: Lineage: cd PARTITION(cd_education_status=Primary).cd_gender SIMPLE [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +POSTHOOK: Lineage: cd PARTITION(cd_education_status=Primary).cd_marital_status SIMPLE [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col3, type:string, comment:), ] +POSTHOOK: Lineage: cd PARTITION(cd_education_status=Primary).cd_purchase_estimate EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col4, type:string, comment:), ] +PREHOOK: query: explain vectorization detail +analyze table cd partition (cd_education_status) compute statistics +PREHOOK: type: QUERY +POSTHOOK: query: explain vectorization detail +analyze table cd partition (cd_education_status) compute statistics +POSTHOOK: type: QUERY +PLAN VECTORIZATION: + enabled: true + enabledConditionsMet: [hive.vectorized.execution.enabled IS true] + +STAGE DEPENDENCIES: + Stage-0 is a root stage + Stage-1 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-0 + Map Reduce + Map Operator Tree: + TableScan + alias: cd + Statistics: Num rows: 1 Data size: 20 Basic stats: COMPLETE Column stats: NONE + Map Vectorization: + enabled: true + enabledConditionsMet: hive.vectorized.use.vector.serde.deserialize IS true + inputFileFormats: org.apache.hadoop.mapred.TextInputFormat + notVectorizedReason: TABLESCAN operator: gather stats not supported + vectorized: false + + Stage: Stage-1 + Stats Work + Basic Stats Work: + +PREHOOK: query: analyze table cd partition (cd_education_status) compute statistics +PREHOOK: type: QUERY +PREHOOK: Input: default@cd +PREHOOK: Input: default@cd@cd_education_status=Primary +PREHOOK: Output: default@cd +PREHOOK: Output: default@cd@cd_education_status=Primary +POSTHOOK: query: analyze table cd partition (cd_education_status) compute statistics +POSTHOOK: type: QUERY +POSTHOOK: Input: default@cd +POSTHOOK: Input: default@cd@cd_education_status=Primary +POSTHOOK: Output: default@cd +POSTHOOK: Output: default@cd@cd_education_status=Primary