diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/ColumnPrunerProcFactory.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/ColumnPrunerProcFactory.java
index 00ec03e..689bdc4 100644
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/ColumnPrunerProcFactory.java
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/ColumnPrunerProcFactory.java
@@ -810,11 +810,18 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx ctx,
       ArrayList newOutputColumnNames = new ArrayList();
       ArrayList rs_oldsignature = op.getSchema().getSignature();
       ArrayList rs_newsignature = new ArrayList();
+      // The pruning needs to preserve the order of columns in the input schema
+      ArrayList colNames = new ArrayList();
       for (FieldNode col : cols) {
-        int index = originalOutputColumnNames.indexOf(col.getFieldName());
-        newOutputColumnNames.add(col.getFieldName());
-        newColList.add(originalColList.get(index));
-        rs_newsignature.add(rs_oldsignature.get(index));
+        colNames.add(col.getFieldName());
+      }
+      for (int i = 0; i < originalOutputColumnNames.size(); i++) {
+        String colName = originalOutputColumnNames.get(i);
+        if (colNames.contains(colName)) {
+          newOutputColumnNames.add(colName);
+          newColList.add(originalColList.get(i));
+          rs_newsignature.add(rs_oldsignature.get(i));
+        }
       }
       op.getSchema().setSignature(rs_newsignature);
       conf.setColList(newColList);
diff --git ql/src/test/queries/clientpositive/select_column_pruning.q ql/src/test/queries/clientpositive/select_column_pruning.q
new file mode 100644
index 0000000..207cd32
--- /dev/null
+++ ql/src/test/queries/clientpositive/select_column_pruning.q
@@ -0,0 +1,4 @@
+CREATE TABLE lv_table1( c1 STRING, c2 ARRAY, c3 INT, c4 CHAR(1), c5 STRING, c6 STRING, c7 STRING, c8 STRING, c9 STRING, c10 STRING, c11 STRING);
+INSERT OVERWRITE TABLE lv_table1 SELECT 'abc ', array(1,2,3), 100, 't', 'test', 'test', 'test', 'test', 'test', 'test', 'test' FROM src;
+EXPLAIN SELECT * FROM lv_table1 LATERAL VIEW explode(array(1,2,3)) myTable AS myCol WHERE c3 = 100 SORT BY c1 ASC, myCol ASC LIMIT 1;
+SELECT * FROM lv_table1 LATERAL VIEW explode(array(1,2,3)) myTable AS myCol WHERE c3 = 100 SORT BY c1 ASC, myCol ASC LIMIT 1;
\ No newline at end of file
diff --git ql/src/test/results/clientpositive/select_column_pruning.q.out ql/src/test/results/clientpositive/select_column_pruning.q.out
new file mode 100644
index 0000000..c33db41
--- /dev/null
+++ ql/src/test/results/clientpositive/select_column_pruning.q.out
@@ -0,0 +1,141 @@
+PREHOOK: query: CREATE TABLE lv_table1( c1 STRING, c2 ARRAY, c3 INT, c4 CHAR(1), c5 STRING, c6 STRING, c7 STRING, c8 STRING, c9 STRING, c10 STRING, c11 STRING)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@lv_table1
+POSTHOOK: query: CREATE TABLE lv_table1( c1 STRING, c2 ARRAY, c3 INT, c4 CHAR(1), c5 STRING, c6 STRING, c7 STRING, c8 STRING, c9 STRING, c10 STRING, c11 STRING)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@lv_table1
+PREHOOK: query: INSERT OVERWRITE TABLE lv_table1 SELECT 'abc ', array(1,2,3), 100, 't', 'test', 'test', 'test', 'test', 'test', 'test', 'test' FROM src
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@lv_table1
+POSTHOOK: query: INSERT OVERWRITE TABLE lv_table1 SELECT 'abc ', array(1,2,3), 100, 't', 'test', 'test', 'test', 'test', 'test', 'test', 'test' FROM src
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@lv_table1
+POSTHOOK: Lineage: lv_table1.c1 SIMPLE []
+POSTHOOK: Lineage: lv_table1.c10 SIMPLE []
+POSTHOOK: Lineage: lv_table1.c11 SIMPLE []
+POSTHOOK: Lineage: lv_table1.c2 EXPRESSION []
+POSTHOOK: Lineage: lv_table1.c3 SIMPLE []
+POSTHOOK: Lineage: lv_table1.c4 EXPRESSION []
+POSTHOOK: Lineage: lv_table1.c5 SIMPLE []
+POSTHOOK: Lineage: lv_table1.c6 SIMPLE []
+POSTHOOK: Lineage: lv_table1.c7 SIMPLE []
+POSTHOOK: Lineage: lv_table1.c8 SIMPLE []
+POSTHOOK: Lineage: lv_table1.c9 SIMPLE []
+PREHOOK: query: EXPLAIN SELECT * FROM lv_table1 LATERAL VIEW explode(array(1,2,3)) myTable AS myCol WHERE c3 = 100 SORT BY c1 ASC, myCol ASC LIMIT 1
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN SELECT * FROM lv_table1 LATERAL VIEW explode(array(1,2,3)) myTable AS myCol WHERE c3 = 100 SORT BY c1 ASC, myCol ASC LIMIT 1
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-2 depends on stages: Stage-1
+  Stage-0 depends on stages: Stage-2
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: lv_table1
+            Statistics: Num rows: 500 Data size: 26000 Basic stats: COMPLETE Column stats: NONE
+            Filter Operator
+              predicate: (c3 = 100) (type: boolean)
+              Statistics: Num rows: 250 Data size: 13000 Basic stats: COMPLETE Column stats: NONE
+              Lateral View Forward
+                Statistics: Num rows: 250 Data size: 13000 Basic stats: COMPLETE Column stats: NONE
+                Select Operator
+                  expressions: c1 (type: string), c2 (type: array), c4 (type: char(1)), c5 (type: string), c6 (type: string), c7 (type: string), c8 (type: string), c9 (type: string), c10 (type: string), c11 (type: string)
+                  outputColumnNames: c1, c2, c4, c5, c6, c7, c8, c9, c10, c11
+                  Statistics: Num rows: 250 Data size: 13000 Basic stats: COMPLETE Column stats: NONE
+                  Lateral View Join Operator
+                    outputColumnNames: _col0, _col1, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col14
+                    Statistics: Num rows: 500 Data size: 26000 Basic stats: COMPLETE Column stats: NONE
+                    Select Operator
+                      expressions: _col0 (type: string), _col1 (type: array), _col3 (type: char(1)), _col4 (type: string), _col5 (type: string), _col6 (type: string), _col7 (type: string), _col8 (type: string), _col9 (type: string), _col10 (type: string), _col14 (type: int)
+                      outputColumnNames: _col0, _col1, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11
+                      Statistics: Num rows: 500 Data size: 26000 Basic stats: COMPLETE Column stats: NONE
+                      Reduce Output Operator
+                        key expressions: _col0 (type: string), _col11 (type: int)
+                        sort order: ++
+                        Statistics: Num rows: 500 Data size: 26000 Basic stats: COMPLETE Column stats: NONE
+                        TopN Hash Memory Usage: 0.1
+                        value expressions: _col1 (type: array), _col3 (type: char(1)), _col4 (type: string), _col5 (type: string), _col6 (type: string), _col7 (type: string), _col8 (type: string), _col9 (type: string), _col10 (type: string)
+                Select Operator
+                  expressions: array(1,2,3) (type: array)
+                  outputColumnNames: _col0
+                  Statistics: Num rows: 250 Data size: 13000 Basic stats: COMPLETE Column stats: NONE
+                  UDTF Operator
+                    Statistics: Num rows: 250 Data size: 13000 Basic stats: COMPLETE Column stats: NONE
+                    function name: explode
+                    Lateral View Join Operator
+                      outputColumnNames: _col0, _col1, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col14
+                      Statistics: Num rows: 500 Data size: 26000 Basic stats: COMPLETE Column stats: NONE
+                      Select Operator
+                        expressions: _col0 (type: string), _col1 (type: array), _col3 (type: char(1)), _col4 (type: string), _col5 (type: string), _col6 (type: string), _col7 (type: string), _col8 (type: string), _col9 (type: string), _col10 (type: string), _col14 (type: int)
+                        outputColumnNames: _col0, _col1, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11
+                        Statistics: Num rows: 500 Data size: 26000 Basic stats: COMPLETE Column stats: NONE
+                        Reduce Output Operator
+                          key expressions: _col0 (type: string), _col11 (type: int)
+                          sort order: ++
+                          Statistics: Num rows: 500 Data size: 26000 Basic stats: COMPLETE Column stats: NONE
+                          TopN Hash Memory Usage: 0.1
+                          value expressions: _col1 (type: array), _col3 (type: char(1)), _col4 (type: string), _col5 (type: string), _col6 (type: string), _col7 (type: string), _col8 (type: string), _col9 (type: string), _col10 (type: string)
+      Reduce Operator Tree:
+        Select Operator
+          expressions: KEY.reducesinkkey0 (type: string), VALUE._col0 (type: array), VALUE._col2 (type: char(1)), VALUE._col3 (type: string), VALUE._col4 (type: string), VALUE._col5 (type: string), VALUE._col6 (type: string), VALUE._col7 (type: string), VALUE._col8 (type: string), VALUE._col9 (type: string), KEY.reducesinkkey1 (type: int)
+          outputColumnNames: _col0, _col1, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11
+          Statistics: Num rows: 500 Data size: 26000 Basic stats: COMPLETE Column stats: NONE
+          Limit
+            Number of rows: 1
+            Statistics: Num rows: 1 Data size: 52 Basic stats: COMPLETE Column stats: NONE
+            File Output Operator
+              compressed: false
+              table:
+                  input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                  serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
+
+  Stage: Stage-2
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            Reduce Output Operator
+              key expressions: _col0 (type: string), _col11 (type: int)
+              sort order: ++
+              Statistics: Num rows: 1 Data size: 52 Basic stats: COMPLETE Column stats: NONE
+              TopN Hash Memory Usage: 0.1
+              value expressions: _col1 (type: array), _col3 (type: char(1)), _col4 (type: string), _col5 (type: string), _col6 (type: string), _col7 (type: string), _col8 (type: string), _col9 (type: string), _col10 (type: string)
+      Reduce Operator Tree:
+        Select Operator
+          expressions: KEY.reducesinkkey0 (type: string), VALUE._col0 (type: array), 100 (type: int), VALUE._col2 (type: char(1)), VALUE._col3 (type: string), VALUE._col4 (type: string), VALUE._col5 (type: string), VALUE._col6 (type: string), VALUE._col7 (type: string), VALUE._col8 (type: string), VALUE._col9 (type: string), KEY.reducesinkkey1 (type: int)
+          outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11
+          Statistics: Num rows: 1 Data size: 52 Basic stats: COMPLETE Column stats: NONE
+          Limit
+            Number of rows: 1
+            Statistics: Num rows: 1 Data size: 52 Basic stats: COMPLETE Column stats: NONE
+            File Output Operator
+              compressed: false
+              Statistics: Num rows: 1 Data size: 52 Basic stats: COMPLETE Column stats: NONE
+              table:
+                  input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                  serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: 1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: SELECT * FROM lv_table1 LATERAL VIEW explode(array(1,2,3)) myTable AS myCol WHERE c3 = 100 SORT BY c1 ASC, myCol ASC LIMIT 1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@lv_table1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM lv_table1 LATERAL VIEW explode(array(1,2,3)) myTable AS myCol WHERE c3 = 100 SORT BY c1 ASC, myCol ASC LIMIT 1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@lv_table1
+#### A masked pattern was here ####
+abc 	[1,2,3]	100	t	test	test	test	test	test	test	test	1
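
Not part of the patch, for reviewers: a minimal standalone sketch of the order-preserving pruning idea the ColumnPrunerProcFactory hunk above implements. The old loop iterated the columns requested by the pruner, so a pruned SelectOperator could emit columns in a different order from its input schema; the new code first collects the requested names and then walks the original output schema, keeping only the requested columns in schema order. The class name and the use of plain String column names below are illustrative assumptions (the pasted diff also drops the generic type parameters of the real ArrayLists); the actual code applies the same idea to Hive's expression lists and ColumnInfo signatures.

// Standalone illustration only; names and types here are assumptions,
// not the patched Hive code.
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class OrderPreservingPruneSketch {

  // Keep only the requested columns, but emit them in the order of the
  // operator's original output schema rather than the order in which the
  // pruner listed them (mirrors the second loop added by the patch).
  static List<String> prune(List<String> originalOutputColumnNames,
                            List<String> requestedColumns) {
    Set<String> requested = new HashSet<>(requestedColumns);
    List<String> kept = new ArrayList<>();
    for (String colName : originalOutputColumnNames) {
      if (requested.contains(colName)) {
        kept.add(colName);
      }
    }
    return kept;
  }

  public static void main(String[] args) {
    // The pruner may request columns out of schema order (e.g. c3 first
    // because the WHERE clause referenced it); the result stays c1, c2, c3.
    System.out.println(prune(
        Arrays.asList("c1", "c2", "c3", "c4"),
        Arrays.asList("c3", "c1", "c2")));   // prints [c1, c2, c3]
  }
}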