diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ColumnPruner.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ColumnPruner.java
index 7e39d77..d59603e 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ColumnPruner.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ColumnPruner.java
@@ -158,14 +158,15 @@ protected void walk(Node nd) throws SemanticException {
       boolean walkChildren = true;
       opStack.push(nd);
 
-      // no need to go further down for a select op with a file sink or script
-      // child
-      // since all cols are needed for these ops
+      // no need to go further down for a select op whose children are all
+      // file sinks or scripts, since all cols are needed for these ops.
+      // However, if one of the children is not a file sink or script, we still go down.
       if (nd instanceof SelectOperator) {
+        walkChildren = false;
         for (Node child : nd.getChildren()) {
-          if ((child instanceof FileSinkOperator)
-              || (child instanceof ScriptOperator)) {
-            walkChildren = false;
+          if (!(child instanceof FileSinkOperator || child instanceof ScriptOperator)) {
+            walkChildren = true;
+            break;
           }
         }
       }
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
index f2a6ade..5055d9b 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
@@ -7432,11 +7432,9 @@ Operator genConversionSelectOperator(String dest, QB qb, Operator input,
           rowFields.get(rowFieldsOffset).getInternalName(), "", false,
           rowFields.get(rowFieldsOffset).isSkewedCol());
       // LazySimpleSerDe can convert any types to String type using
-      // JSON-format.
-      if (!tableFieldTypeInfo.equals(rowFieldTypeInfo)
-          && !(isLazySimpleSerDe
-          && tableFieldTypeInfo.getCategory().equals(Category.PRIMITIVE) && tableFieldTypeInfo
-          .equals(TypeInfoFactory.stringTypeInfo))) {
+      // JSON-format. However, other operators (e.g., the column stats select)
+      // may now follow this select, so we still keep the conversion.
+      if (!tableFieldTypeInfo.equals(rowFieldTypeInfo)) {
         // need to do some conversions here
         converted = true;
         if (tableFieldTypeInfo.getCategory() != Category.PRIMITIVE) {
diff --git a/ql/src/test/queries/clientpositive/column_pruner_multiple_children.q b/ql/src/test/queries/clientpositive/column_pruner_multiple_children.q
new file mode 100644
index 0000000..9315239
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/column_pruner_multiple_children.q
@@ -0,0 +1,19 @@
+set hive.map.aggr=false;
+set hive.stats.column.autogather=true;
+
+CREATE TABLE DEST1(key INT, value STRING) STORED AS TEXTFILE;
+
+create table s as select * from src where key='10';
+
+explain FROM S
+INSERT OVERWRITE TABLE DEST1 SELECT key, sum(SUBSTR(value,5)) GROUP BY key
+;
+
+FROM S
+INSERT OVERWRITE TABLE DEST1 SELECT key, sum(SUBSTR(value,5)) GROUP BY key
+;
+
+desc formatted DEST1;
+
+desc formatted DEST1 key;
+desc formatted DEST1 value;
diff --git a/ql/src/test/results/clientpositive/column_pruner_multiple_children.q.out b/ql/src/test/results/clientpositive/column_pruner_multiple_children.q.out
new file mode 100644
index 0000000..96feeed
--- /dev/null
+++ b/ql/src/test/results/clientpositive/column_pruner_multiple_children.q.out
@@ -0,0 +1,189 @@
+PREHOOK: query: CREATE TABLE DEST1(key INT, value STRING) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@DEST1
+POSTHOOK: query: CREATE TABLE DEST1(key INT, value STRING) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@DEST1
+PREHOOK: query: create table s as select * from src where key='10'
+PREHOOK: type: CREATETABLE_AS_SELECT
+PREHOOK: Input: default@src
+PREHOOK: Output: database:default
+PREHOOK: Output: default@s
+POSTHOOK: query: create table s as select * from src where key='10'
+POSTHOOK: type: CREATETABLE_AS_SELECT
+POSTHOOK: Input: default@src
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@s
+POSTHOOK: Lineage: s.key SIMPLE []
+POSTHOOK: Lineage: s.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: explain FROM S
+INSERT OVERWRITE TABLE DEST1 SELECT key, sum(SUBSTR(value,5)) GROUP BY key
+PREHOOK: type: QUERY
+POSTHOOK: query: explain FROM S
+INSERT OVERWRITE TABLE DEST1 SELECT key, sum(SUBSTR(value,5)) GROUP BY key
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+  Stage-2 depends on stages: Stage-0
+  Stage-4 depends on stages: Stage-2, Stage-3
+  Stage-3 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: s
+            Statistics: Num rows: 1 Data size: 9 Basic stats: COMPLETE Column stats: NONE
+            Select Operator
+              expressions: key (type: string), substr(value, 5) (type: string)
+              outputColumnNames: _col0, _col1
+              Statistics: Num rows: 1 Data size: 9 Basic stats: COMPLETE Column stats: NONE
+              Reduce Output Operator
+                key expressions: _col0 (type: string)
+                sort order: +
+                Map-reduce partition columns: _col0 (type: string)
+                Statistics: Num rows: 1 Data size: 9 Basic stats: COMPLETE Column stats: NONE
+                value expressions: _col1 (type: string)
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations: sum(VALUE._col0)
+          keys: KEY._col0 (type: string)
+          mode: complete
+          outputColumnNames: _col0, _col1
+          Statistics: Num rows: 1 Data size: 9 Basic stats: COMPLETE Column stats: NONE
+          Select Operator
+            expressions: UDFToInteger(_col0) (type: int), UDFToString(_col1) (type: string)
+            outputColumnNames: _col0, _col1
+            Statistics: Num rows: 1 Data size: 9 Basic stats: COMPLETE Column stats: NONE
+            File Output Operator
+              compressed: false
+              Statistics: Num rows: 1 Data size: 9 Basic stats: COMPLETE Column stats: NONE
+              table:
+                  input format: org.apache.hadoop.mapred.TextInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                  serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                  name: default.dest1
+            Select Operator
+              expressions: _col0 (type: int), _col1 (type: string)
+              outputColumnNames: key, value
+              Statistics: Num rows: 1 Data size: 9 Basic stats: COMPLETE Column stats: NONE
+              File Output Operator
+                compressed: false
+                table:
+                    input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                    output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                    serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
+
+  Stage: Stage-0
+    Move Operator
+      tables:
+          replace: true
+          table:
+              input format: org.apache.hadoop.mapred.TextInputFormat
+              output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              name: default.dest1
+
+  Stage: Stage-2
+    Stats-Aggr Operator
+
+  Stage: Stage-4
+    Column Stats Work
+      Column Stats Desc:
+          Columns: key, value
+          Column Types: int, string
+          Table: default.dest1
+
+  Stage: Stage-3
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            Reduce Output Operator
+              sort order: 
+              Statistics: Num rows: 1 Data size: 9 Basic stats: COMPLETE Column stats: NONE
+              value expressions: key (type: int), value (type: string)
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations: compute_stats(VALUE._col0, 16), compute_stats(VALUE._col2, 16)
+          mode: complete
+          outputColumnNames: _col0, _col1
+          Statistics: Num rows: 1 Data size: 960 Basic stats: COMPLETE Column stats: NONE
+          File Output Operator
+            compressed: false
+            Statistics: Num rows: 1 Data size: 960 Basic stats: COMPLETE Column stats: NONE
+            table:
+                input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+PREHOOK: query: FROM S
+INSERT OVERWRITE TABLE DEST1 SELECT key, sum(SUBSTR(value,5)) GROUP BY key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@s
+PREHOOK: Output: default@dest1
+POSTHOOK: query: FROM S
+INSERT OVERWRITE TABLE DEST1 SELECT key, sum(SUBSTR(value,5)) GROUP BY key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@s
+POSTHOOK: Output: default@dest1
+POSTHOOK: Lineage: dest1.key EXPRESSION [(s)s.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: dest1.value EXPRESSION [(s)s.FieldSchema(name:value, type:string, comment:null), ]
+PREHOOK: query: desc formatted DEST1
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@dest1
+POSTHOOK: query: desc formatted DEST1
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@dest1
+# col_name	data_type	comment
+
+key	int	
+value	string	
+
+# Detailed Table Information
+Database:	default
+#### A masked pattern was here ####
+Retention:	0
+#### A masked pattern was here ####
+Table Type:	MANAGED_TABLE
+Table Parameters:
+	COLUMN_STATS_ACCURATE	{\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\",\"value\":\"true\"}}
+	numFiles	1
+	numRows	1
+	rawDataSize	7
+	totalSize	8
+#### A masked pattern was here ####
+
+# Storage Information
+SerDe Library:	org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+InputFormat:	org.apache.hadoop.mapred.TextInputFormat
+OutputFormat:	org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+Compressed:	No
+Num Buckets:	-1
+Bucket Columns:	[]
+Sort Columns:	[]
+Storage Desc Params:
+	serialization.format	1
+PREHOOK: query: desc formatted DEST1 key
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@dest1
+POSTHOOK: query: desc formatted DEST1 key
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@dest1
+# col_name	data_type	min	max	num_nulls	distinct_count	avg_col_len	max_col_len	num_trues	num_falses	comment
+
+key	int	10	10	0	1	 	 	 	 	from deserializer
+COLUMN_STATS_ACCURATE	{\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\",\"value\":\"true\"}}
+PREHOOK: query: desc formatted DEST1 value
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@dest1
+POSTHOOK: query: desc formatted DEST1 value
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@dest1
+# col_name	data_type	min	max	num_nulls	distinct_count	avg_col_len	max_col_len	num_trues	num_falses	comment
+
+value	string	 	 	0	1	4.0	4	 	 	from deserializer
+COLUMN_STATS_ACCURATE	{\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\",\"value\":\"true\"}}
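
Note for reviewers: the heart of the ColumnPruner fix is the inverted child test. Before, a single FileSinkOperator or ScriptOperator child was enough to stop the walk, so a sibling branch (here, the extra select generated by hive.stats.column.autogather for column stats) was never visited; now the walk stops only when every child is a file sink or script. Below is a minimal, self-contained sketch of that decision, using simplified stand-in classes rather than Hive's real operator hierarchy (Node, Op, the operator subclasses, and walkChildren are illustrative only):

// PrunerWalkSketch.java -- illustrative stand-ins, not Hive's actual classes.
import java.util.Arrays;
import java.util.List;

public class PrunerWalkSketch {
  interface Node { List<Node> getChildren(); }

  static class Op implements Node {
    private final List<Node> children;
    Op(Node... children) { this.children = Arrays.asList(children); }
    public List<Node> getChildren() { return children; }
  }

  static class SelectOperator extends Op { SelectOperator(Node... c) { super(c); } }
  static class FileSinkOperator extends Op { }
  static class ScriptOperator extends Op { }
  static class GroupByOperator extends Op { }

  // Old rule: any file sink or script child stopped the walk.
  // New rule: stop only if EVERY child is a file sink or a script.
  static boolean walkChildren(Node nd) {
    if (!(nd instanceof SelectOperator)) {
      return true;
    }
    for (Node child : nd.getChildren()) {
      if (!(child instanceof FileSinkOperator || child instanceof ScriptOperator)) {
        return true; // some branch below still needs pruning
      }
    }
    return false; // every child consumes all columns
  }

  public static void main(String[] args) {
    // Select feeding both a file sink and another operator, as in the new plan:
    Node mixed = new SelectOperator(new FileSinkOperator(), new GroupByOperator());
    // Select feeding only a file sink, the case the old shortcut targeted:
    Node sinkOnly = new SelectOperator(new FileSinkOperator());
    System.out.println(walkChildren(mixed));    // true  (keep walking)
    System.out.println(walkChildren(sinkOnly)); // false (stop here)
  }
}

Under the old test, walkChildren(mixed) would have returned false, which is exactly the multiple-children scenario that column_pruner_multiple_children.q exercises.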