diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ColumnPrunerProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ColumnPrunerProcFactory.java index 57ce849..4bbae63 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ColumnPrunerProcFactory.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ColumnPrunerProcFactory.java @@ -305,7 +305,7 @@ private RowResolver buildPrunedRR(List prunedCols, /* * add any input columns referenced in WindowFn args or expressions. */ - private ArrayList prunedColumnsList(List prunedCols, + private ArrayList prunedColumnsList(List prunedCols, WindowTableFunctionDef tDef) { //we create a copy of prunedCols to create a list of pruned columns for PTFOperator ArrayList mergedColList = new ArrayList(prunedCols); @@ -518,7 +518,8 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx ctx, colLists = Utilities.mergeUniqElems(colLists, valCols.get(index).getCols()); } - Collections.sort(colLists); + if (!(child instanceof CommonJoinOperator)) + Collections.sort(colLists); pruneReduceSinkOperator(flags, op, cppCtx); cppCtx.getPrunedColLists().put(op, colLists); return null; @@ -628,7 +629,7 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx ctx, ((SelectDesc)select.getConf()).setColList(colList); ((SelectDesc)select.getConf()).setOutputColumnNames(outputColNames); pruneOperator(ctx, select, outputColNames); - + Operator udtfPath = op.getChildOperators().get(LateralViewJoinOperator.UDTF_TAG); List lvFCols = new ArrayList(cppCtx.getPrunedColLists().get(udtfPath)); lvFCols = Utilities.mergeUniqElems(lvFCols, outputColNames); diff --git a/ql/src/test/queries/clientpositive/with_column_pruner.q b/ql/src/test/queries/clientpositive/with_column_pruner.q new file mode 100644 index 0000000..f34ffc7 --- /dev/null +++ b/ql/src/test/queries/clientpositive/with_column_pruner.q @@ -0,0 +1,20 @@ +DROP TABLE IF EXISTS atab; +CREATE TABLE IF NOT EXISTS atab (ks_uid BIGINT, sr_uid STRING, sr_id STRING, tstamp STRING, m_id STRING, act STRING, at_sr_uid STRING, tstamp_type STRING, original_m_id STRING, original_tstamp STRING, registered_flag TINYINT, at_ks_uid BIGINT) PARTITIONED BY (dt STRING,nt STRING); +LOAD DATA LOCAL INPATH '../../data/files/v1.txt' INTO TABLE atab PARTITION (dt='20130311', nt='tw'); + +DROP TABLE IF EXISTS mstab; +CREATE TABLE mstab(ks_uid INT, csc INT) PARTITIONED BY (dt STRING); +LOAD DATA LOCAL INPATH '../../data/files/v2.txt' INTO TABLE mstab PARTITION (dt='20130311'); + +EXPLAIN +WITH a AS ( SELECT * FROM atab WHERE dt='20130311' AND nt='tw' AND ks_uid=1111 LIMIT 2 ), +b AS ( SELECT * FROM mstab WHERE mstab.dt='20130311' ) +SELECT * FROM a JOIN b +ON a.ks_uid = b.ks_uid +LIMIT 100; + +WITH a AS ( SELECT * FROM atab WHERE dt='20130311' AND nt='tw' AND ks_uid=1111 LIMIT 2 ), +b AS ( SELECT * FROM mstab WHERE mstab.dt='20130311' ) +SELECT * FROM a JOIN b +ON a.ks_uid = b.ks_uid +LIMIT 100; diff --git a/ql/src/test/results/clientpositive/with_column_pruner.q.out b/ql/src/test/results/clientpositive/with_column_pruner.q.out new file mode 100644 index 0000000..8824e9c --- /dev/null +++ b/ql/src/test/results/clientpositive/with_column_pruner.q.out @@ -0,0 +1,176 @@ +PREHOOK: query: DROP TABLE IF EXISTS atab +PREHOOK: type: DROPTABLE +POSTHOOK: query: DROP TABLE IF EXISTS atab +POSTHOOK: type: DROPTABLE +PREHOOK: query: CREATE TABLE IF NOT EXISTS atab (ks_uid BIGINT, sr_uid STRING, sr_id STRING, tstamp STRING, m_id STRING, act STRING, at_sr_uid STRING, tstamp_type STRING, original_m_id STRING, original_tstamp STRING, registered_flag TINYINT, at_ks_uid BIGINT) PARTITIONED BY (dt STRING,nt STRING) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@atab +POSTHOOK: query: CREATE TABLE IF NOT EXISTS atab (ks_uid BIGINT, sr_uid STRING, sr_id STRING, tstamp STRING, m_id STRING, act STRING, at_sr_uid STRING, tstamp_type STRING, original_m_id STRING, original_tstamp STRING, registered_flag TINYINT, at_ks_uid BIGINT) PARTITIONED BY (dt STRING,nt STRING) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@atab +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/v1.txt' INTO TABLE atab PARTITION (dt='20130311', nt='tw') +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@atab +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/v1.txt' INTO TABLE atab PARTITION (dt='20130311', nt='tw') +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@atab +POSTHOOK: Output: default@atab@dt=20130311/nt=tw +PREHOOK: query: DROP TABLE IF EXISTS mstab +PREHOOK: type: DROPTABLE +POSTHOOK: query: DROP TABLE IF EXISTS mstab +POSTHOOK: type: DROPTABLE +PREHOOK: query: CREATE TABLE mstab(ks_uid INT, csc INT) PARTITIONED BY (dt STRING) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@mstab +POSTHOOK: query: CREATE TABLE mstab(ks_uid INT, csc INT) PARTITIONED BY (dt STRING) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@mstab +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/v2.txt' INTO TABLE mstab PARTITION (dt='20130311') +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@mstab +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/v2.txt' INTO TABLE mstab PARTITION (dt='20130311') +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@mstab +POSTHOOK: Output: default@mstab@dt=20130311 +PREHOOK: query: EXPLAIN +WITH a AS ( SELECT * FROM atab WHERE dt='20130311' AND nt='tw' AND ks_uid=1111 LIMIT 2 ), +b AS ( SELECT * FROM mstab WHERE mstab.dt='20130311' ) +SELECT * FROM a JOIN b +ON a.ks_uid = b.ks_uid +LIMIT 100 +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +WITH a AS ( SELECT * FROM atab WHERE dt='20130311' AND nt='tw' AND ks_uid=1111 LIMIT 2 ), +b AS ( SELECT * FROM mstab WHERE mstab.dt='20130311' ) +SELECT * FROM a JOIN b +ON a.ks_uid = b.ks_uid +LIMIT 100 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: atab + Statistics: Num rows: 0 Data size: 384 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: (ks_uid = 1111) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: sr_uid (type: string), sr_id (type: string), tstamp (type: string), m_id (type: string), act (type: string), at_sr_uid (type: string), tstamp_type (type: string), original_m_id (type: string), original_tstamp (type: string), registered_flag (type: tinyint), at_ks_uid (type: bigint) + outputColumnNames: _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Limit + Number of rows: 2 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + value expressions: _col1 (type: string), _col2 (type: string), _col3 (type: string), _col4 (type: string), _col5 (type: string), _col6 (type: string), _col7 (type: string), _col8 (type: string), _col9 (type: string), _col10 (type: tinyint), _col11 (type: bigint) + Reduce Operator Tree: + Select Operator + expressions: VALUE._col1 (type: string), VALUE._col2 (type: string), VALUE._col3 (type: string), VALUE._col4 (type: string), VALUE._col5 (type: string), VALUE._col6 (type: string), VALUE._col7 (type: string), VALUE._col8 (type: string), VALUE._col9 (type: string), VALUE._col10 (type: tinyint), VALUE._col11 (type: bigint) + outputColumnNames: _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Limit + Number of rows: 2 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + key expressions: 1111 (type: bigint) + sort order: + + Map-reduce partition columns: 1111 (type: bigint) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + value expressions: _col1 (type: string), _col2 (type: string), _col3 (type: string), _col4 (type: string), _col5 (type: string), _col6 (type: string), _col7 (type: string), _col8 (type: string), _col9 (type: string), _col10 (type: tinyint), _col11 (type: bigint) + TableScan + alias: mstab + Statistics: Num rows: 12 Data size: 99 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: UDFToLong(ks_uid) is not null (type: boolean) + Statistics: Num rows: 6 Data size: 49 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: ks_uid (type: int), csc (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 6 Data size: 49 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: UDFToLong(_col0) (type: bigint) + sort order: + + Map-reduce partition columns: UDFToLong(_col0) (type: bigint) + Statistics: Num rows: 6 Data size: 49 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: int), _col1 (type: int) + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: bigint) + 1 UDFToLong(_col0) (type: bigint) + outputColumnNames: _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col14, _col15 + Statistics: Num rows: 6 Data size: 53 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: 1111 (type: bigint), _col1 (type: string), _col2 (type: string), _col3 (type: string), _col4 (type: string), _col5 (type: string), _col6 (type: string), _col7 (type: string), _col8 (type: string), _col9 (type: string), _col10 (type: tinyint), _col11 (type: bigint), '20130311' (type: string), 'tw' (type: string), _col14 (type: int), _col15 (type: int), '20130311' (type: string) + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, _col16 + Statistics: Num rows: 6 Data size: 53 Basic stats: COMPLETE Column stats: NONE + Limit + Number of rows: 100 + Statistics: Num rows: 6 Data size: 53 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 6 Data size: 53 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: 100 + Processor Tree: + ListSink + +PREHOOK: query: WITH a AS ( SELECT * FROM atab WHERE dt='20130311' AND nt='tw' AND ks_uid=1111 LIMIT 2 ), +b AS ( SELECT * FROM mstab WHERE mstab.dt='20130311' ) +SELECT * FROM a JOIN b +ON a.ks_uid = b.ks_uid +LIMIT 100 +PREHOOK: type: QUERY +PREHOOK: Input: default@atab +PREHOOK: Input: default@atab@dt=20130311/nt=tw +PREHOOK: Input: default@mstab +PREHOOK: Input: default@mstab@dt=20130311 +#### A masked pattern was here #### +POSTHOOK: query: WITH a AS ( SELECT * FROM atab WHERE dt='20130311' AND nt='tw' AND ks_uid=1111 LIMIT 2 ), +b AS ( SELECT * FROM mstab WHERE mstab.dt='20130311' ) +SELECT * FROM a JOIN b +ON a.ks_uid = b.ks_uid +LIMIT 100 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@atab +POSTHOOK: Input: default@atab@dt=20130311/nt=tw +POSTHOOK: Input: default@mstab +POSTHOOK: Input: default@mstab@dt=20130311 +#### A masked pattern was here #### +1111 foo abc 2013-10-10 12:12:12 xyz fun bar 2013-10-10 12:12:12 lmn 2013-11-11 12:12:12 9 2222 20130311 tw 1111 99999 20130311