diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ColumnPrunerProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ColumnPrunerProcFactory.java index 00ec03e..e0b5424 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ColumnPrunerProcFactory.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ColumnPrunerProcFactory.java @@ -611,12 +611,15 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx ctx, colLists = mergeFieldNodesWithDesc(colLists, valCols.get(index)); } - Collections.sort(colLists, new Comparator() { - @Override - public int compare(FieldNode o1, FieldNode o2) { - return o1.getFieldName().compareTo(o2.getFieldName()); - } - }); + if (!(child instanceof CommonJoinOperator)) { + Collections.sort(colLists, new Comparator() { + @Override + public int compare(FieldNode o1, FieldNode o2) { + return o1.getFieldName().compareTo(o2.getFieldName()); + } + }); + } + pruneReduceSinkOperator(flags, op, cppCtx); cppCtx.getPrunedColLists().put(op, colLists); return null; diff --git a/ql/src/test/queries/clientpositive/with_column_pruner.q b/ql/src/test/queries/clientpositive/with_column_pruner.q new file mode 100644 index 0000000..0e7e545 --- /dev/null +++ b/ql/src/test/queries/clientpositive/with_column_pruner.q @@ -0,0 +1,18 @@ +DROP TABLE IF EXISTS atab; +CREATE TABLE IF NOT EXISTS atab (ks_uid BIGINT, sr_uid STRING, sr_id STRING, tstamp STRING, m_id STRING, act STRING, at_sr_uid STRING, tstamp_type STRING, original_m_id STRING, original_tstamp STRING, registered_flag TINYINT, at_ks_uid BIGINT) PARTITIONED BY (dt STRING,nt STRING); +LOAD DATA LOCAL INPATH '../../data/files/v1.txt' INTO TABLE atab PARTITION (dt='20130311', nt='tw'); + +DROP TABLE IF EXISTS mstab; +CREATE TABLE mstab(ks_uid INT, csc INT) PARTITIONED BY (dt STRING); +LOAD DATA LOCAL INPATH '../../data/files/v2.txt' INTO TABLE mstab PARTITION (dt='20130311'); + +EXPLAIN +WITH a AS ( SELECT * FROM atab WHERE dt='20130311' AND nt='tw' AND ks_uid=1111 LIMIT 2 ), +b AS ( SELECT * FROM mstab WHERE mstab.dt='20130311' ) +SELECT * FROM a JOIN b +ON a.ks_uid = b.ks_uid; + +WITH a AS ( SELECT * FROM atab WHERE dt='20130311' AND nt='tw' AND ks_uid=1111 LIMIT 2 ), +b AS ( SELECT * FROM mstab WHERE mstab.dt='20130311' ) +SELECT * FROM a JOIN b +ON a.ks_uid = b.ks_uid; diff --git a/ql/src/test/results/clientpositive/with_column_pruner.q.out b/ql/src/test/results/clientpositive/with_column_pruner.q.out new file mode 100644 index 0000000..98d6c72 --- /dev/null +++ b/ql/src/test/results/clientpositive/with_column_pruner.q.out @@ -0,0 +1,170 @@ +PREHOOK: query: DROP TABLE IF EXISTS atab +PREHOOK: type: DROPTABLE +POSTHOOK: query: DROP TABLE IF EXISTS atab +POSTHOOK: type: DROPTABLE +PREHOOK: query: CREATE TABLE IF NOT EXISTS atab (ks_uid BIGINT, sr_uid STRING, sr_id STRING, tstamp STRING, m_id STRING, act STRING, at_sr_uid STRING, tstamp_type STRING, original_m_id STRING, original_tstamp STRING, registered_flag TINYINT, at_ks_uid BIGINT) PARTITIONED BY (dt STRING,nt STRING) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@atab +POSTHOOK: query: CREATE TABLE IF NOT EXISTS atab (ks_uid BIGINT, sr_uid STRING, sr_id STRING, tstamp STRING, m_id STRING, act STRING, at_sr_uid STRING, tstamp_type STRING, original_m_id STRING, original_tstamp STRING, registered_flag TINYINT, at_ks_uid BIGINT) PARTITIONED BY (dt STRING,nt STRING) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@atab +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/v1.txt' INTO TABLE atab PARTITION (dt='20130311', nt='tw') +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@atab +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/v1.txt' INTO TABLE atab PARTITION (dt='20130311', nt='tw') +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@atab +POSTHOOK: Output: default@atab@dt=20130311/nt=tw +PREHOOK: query: DROP TABLE IF EXISTS mstab +PREHOOK: type: DROPTABLE +POSTHOOK: query: DROP TABLE IF EXISTS mstab +POSTHOOK: type: DROPTABLE +PREHOOK: query: CREATE TABLE mstab(ks_uid INT, csc INT) PARTITIONED BY (dt STRING) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@mstab +POSTHOOK: query: CREATE TABLE mstab(ks_uid INT, csc INT) PARTITIONED BY (dt STRING) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@mstab +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/v2.txt' INTO TABLE mstab PARTITION (dt='20130311') +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@mstab +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/v2.txt' INTO TABLE mstab PARTITION (dt='20130311') +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@mstab +POSTHOOK: Output: default@mstab@dt=20130311 +PREHOOK: query: EXPLAIN +WITH a AS ( SELECT * FROM atab WHERE dt='20130311' AND nt='tw' AND ks_uid=1111 LIMIT 2 ), +b AS ( SELECT * FROM mstab WHERE mstab.dt='20130311' ) +SELECT * FROM a JOIN b +ON a.ks_uid = b.ks_uid +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +WITH a AS ( SELECT * FROM atab WHERE dt='20130311' AND nt='tw' AND ks_uid=1111 LIMIT 2 ), +b AS ( SELECT * FROM mstab WHERE mstab.dt='20130311' ) +SELECT * FROM a JOIN b +ON a.ks_uid = b.ks_uid +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: atab + Statistics: Num rows: 1 Data size: 384 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: (ks_uid = 1111) (type: boolean) + Statistics: Num rows: 1 Data size: 384 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: sr_uid (type: string), registered_flag (type: tinyint), at_ks_uid (type: bigint), sr_id (type: string), tstamp (type: string), m_id (type: string), act (type: string), at_sr_uid (type: string), tstamp_type (type: string), original_m_id (type: string), original_tstamp (type: string) + outputColumnNames: _col1, _col10, _col11, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9 + Statistics: Num rows: 1 Data size: 384 Basic stats: COMPLETE Column stats: NONE + Limit + Number of rows: 2 + Statistics: Num rows: 1 Data size: 384 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 384 Basic stats: COMPLETE Column stats: NONE + TopN Hash Memory Usage: 0.1 + value expressions: _col1 (type: string), _col2 (type: string), _col3 (type: string), _col4 (type: string), _col5 (type: string), _col6 (type: string), _col7 (type: string), _col8 (type: string), _col9 (type: string), _col10 (type: tinyint), _col11 (type: bigint) + Reduce Operator Tree: + Select Operator + expressions: VALUE._col1 (type: string), VALUE._col2 (type: string), VALUE._col3 (type: string), VALUE._col4 (type: string), VALUE._col5 (type: string), VALUE._col6 (type: string), VALUE._col7 (type: string), VALUE._col8 (type: string), VALUE._col9 (type: string), VALUE._col10 (type: tinyint), VALUE._col11 (type: bigint) + outputColumnNames: _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11 + Statistics: Num rows: 1 Data size: 384 Basic stats: COMPLETE Column stats: NONE + Limit + Number of rows: 2 + Statistics: Num rows: 1 Data size: 384 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + key expressions: 1111 (type: bigint) + sort order: + + Map-reduce partition columns: 1111 (type: bigint) + Statistics: Num rows: 1 Data size: 384 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: string), _col2 (type: string), _col3 (type: string), _col4 (type: string), _col5 (type: string), _col6 (type: string), _col7 (type: string), _col8 (type: string), _col9 (type: string), _col10 (type: tinyint), _col11 (type: bigint) + TableScan + alias: mstab + Statistics: Num rows: 12 Data size: 99 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: UDFToLong(ks_uid) is not null (type: boolean) + Statistics: Num rows: 12 Data size: 99 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: ks_uid (type: int), csc (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 12 Data size: 99 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: UDFToLong(_col0) (type: bigint) + sort order: + + Map-reduce partition columns: UDFToLong(_col0) (type: bigint) + Statistics: Num rows: 12 Data size: 99 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: int), _col1 (type: int) + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: bigint) + 1 UDFToLong(_col0) (type: bigint) + outputColumnNames: _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col14, _col15 + Statistics: Num rows: 13 Data size: 108 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: 1111 (type: bigint), _col1 (type: string), _col2 (type: string), _col3 (type: string), _col4 (type: string), _col5 (type: string), _col6 (type: string), _col7 (type: string), _col8 (type: string), _col9 (type: string), _col10 (type: tinyint), _col11 (type: bigint), '20130311' (type: string), 'tw' (type: string), _col14 (type: int), _col15 (type: int), '20130311' (type: string) + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, _col16 + Statistics: Num rows: 13 Data size: 108 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 13 Data size: 108 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: WITH a AS ( SELECT * FROM atab WHERE dt='20130311' AND nt='tw' AND ks_uid=1111 LIMIT 2 ), +b AS ( SELECT * FROM mstab WHERE mstab.dt='20130311' ) +SELECT * FROM a JOIN b +ON a.ks_uid = b.ks_uid +PREHOOK: type: QUERY +PREHOOK: Input: default@atab +PREHOOK: Input: default@atab@dt=20130311/nt=tw +PREHOOK: Input: default@mstab +PREHOOK: Input: default@mstab@dt=20130311 +#### A masked pattern was here #### +POSTHOOK: query: WITH a AS ( SELECT * FROM atab WHERE dt='20130311' AND nt='tw' AND ks_uid=1111 LIMIT 2 ), +b AS ( SELECT * FROM mstab WHERE mstab.dt='20130311' ) +SELECT * FROM a JOIN b +ON a.ks_uid = b.ks_uid +POSTHOOK: type: QUERY +POSTHOOK: Input: default@atab +POSTHOOK: Input: default@atab@dt=20130311/nt=tw +POSTHOOK: Input: default@mstab +POSTHOOK: Input: default@mstab@dt=20130311 +#### A masked pattern was here #### +1111 foo abc 2013-10-10 12:12:12 xyz fun bar 2013-10-10 12:12:12 lmn 2013-11-11 12:12:12 9 2222 20130311 tw 1111 99999 20130311