diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java
index 24d1681..ee95be7 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java
@@ -117,6 +117,7 @@
 import org.apache.hadoop.hive.serde2.SerDeException;
 import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
 import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
+import org.apache.hadoop.hive.ql.exec.LimitOperator;
 import org.apache.hadoop.mapred.InputFormat;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -1030,8 +1031,15 @@ public static TableScanOperator createTemporaryFile(
       desc.setCompressType(parseCtx.getConf().getVar(
           HiveConf.ConfVars.COMPRESSINTERMEDIATETYPE));
     }
+    Operator pp = parent;
+    RowSchema tmpSchema = pp.getSchema();
+    while (pp instanceof LimitOperator) {
+      pp = pp.getParentOperators().get(0);
+      tmpSchema = pp.getSchema();
+    }
+
     Operator fileSinkOp = OperatorFactory.get(
-        parent.getCompilationOpContext(), desc, parent.getSchema());
+        parent.getCompilationOpContext(), desc, tmpSchema);
 
     // Connect parent to fileSinkOp
     parent.replaceChild(child, fileSinkOp);
@@ -1039,7 +1047,7 @@ public static TableScanOperator createTemporaryFile(
 
     // Create a dummy TableScanOperator for the file generated through fileSinkOp
     TableScanOperator tableScanOp = createTemporaryTableScanOperator(
-        parent.getCompilationOpContext(), parent.getSchema());
+        parent.getCompilationOpContext(), tmpSchema);
 
     // Connect this TableScanOperator to child.
     tableScanOp.setChildOperators(Utilities.makeList(child));
@@ -1080,8 +1088,15 @@ private static void splitTasks(ReduceSinkOperator op,
     Path taskTmpDir = baseCtx.getMRTmpPath();
 
     Operator parent = op.getParentOperators().get(0);
+    Operator pp = parent;
+    RowSchema tmpSchema = pp.getSchema();
+    while (pp instanceof LimitOperator) {
+      pp = pp.getParentOperators().get(0);
+      tmpSchema = pp.getSchema();
+    }
+
     TableDesc tt_desc = PlanUtils.getIntermediateFileTableDesc(PlanUtils
-        .getFieldSchemasFromRowSchema(parent.getSchema(), "temporarycol"));
+        .getFieldSchemasFromRowSchema(tmpSchema, "temporarycol"));
 
     // Create the temporary file, its corresponding FileSinkOperaotr, and
     // its corresponding TableScanOperator.
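Note on the change above: both createTemporaryFile() and splitTasks() now take the RowSchema from the first non-Limit ancestor of parent rather than from parent itself, so the intermediate FileSinkOperator/TableScanOperator pair and the intermediate file's TableDesc are no longer built from a schema owned by a LimitOperator (the case exercised by the LIMIT-inside-CTE query in the new with_column_pruner.q test below). The same walk-up loop appears in both methods; a rough sketch of an equivalent helper is shown here only to make the intent explicit. The method name getSchemaSkippingLimits and the empty-parent guard are illustrative additions, not part of the patch:

  // Sketch only: equivalent to the inline loops added in createTemporaryFile()
  // and splitTasks(); assumes the types already imported in GenMapRedUtils
  // (Operator, OperatorDesc, RowSchema, LimitOperator).
  private static RowSchema getSchemaSkippingLimits(Operator<? extends OperatorDesc> start) {
    Operator<? extends OperatorDesc> op = start;
    // A LimitOperator has a single parent; keep climbing until the first
    // operator that is not a Limit and use its schema.
    while (op instanceof LimitOperator
        && op.getParentOperators() != null
        && !op.getParentOperators().isEmpty()) {
      op = op.getParentOperators().get(0);
    }
    return op.getSchema();
  }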
diff --git a/ql/src/test/queries/clientpositive/with_column_pruner.q b/ql/src/test/queries/clientpositive/with_column_pruner.q
new file mode 100644
index 0000000..0e7e545
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/with_column_pruner.q
@@ -0,0 +1,18 @@
+DROP TABLE IF EXISTS atab;
+CREATE TABLE IF NOT EXISTS atab (ks_uid BIGINT, sr_uid STRING, sr_id STRING, tstamp STRING, m_id STRING, act STRING, at_sr_uid STRING, tstamp_type STRING, original_m_id STRING, original_tstamp STRING, registered_flag TINYINT, at_ks_uid BIGINT) PARTITIONED BY (dt STRING,nt STRING);
+LOAD DATA LOCAL INPATH '../../data/files/v1.txt' INTO TABLE atab PARTITION (dt='20130311', nt='tw');
+
+DROP TABLE IF EXISTS mstab;
+CREATE TABLE mstab(ks_uid INT, csc INT) PARTITIONED BY (dt STRING);
+LOAD DATA LOCAL INPATH '../../data/files/v2.txt' INTO TABLE mstab PARTITION (dt='20130311');
+
+EXPLAIN
+WITH a AS ( SELECT * FROM atab WHERE dt='20130311' AND nt='tw' AND ks_uid=1111 LIMIT 2 ),
+b AS ( SELECT * FROM mstab WHERE mstab.dt='20130311' )
+SELECT * FROM a JOIN b
+ON a.ks_uid = b.ks_uid;
+
+WITH a AS ( SELECT * FROM atab WHERE dt='20130311' AND nt='tw' AND ks_uid=1111 LIMIT 2 ),
+b AS ( SELECT * FROM mstab WHERE mstab.dt='20130311' )
+SELECT * FROM a JOIN b
+ON a.ks_uid = b.ks_uid;
diff --git a/ql/src/test/results/clientpositive/with_column_pruner.q.out b/ql/src/test/results/clientpositive/with_column_pruner.q.out
new file mode 100644
index 0000000..3ef6b5f
--- /dev/null
+++ b/ql/src/test/results/clientpositive/with_column_pruner.q.out
@@ -0,0 +1,170 @@
+PREHOOK: query: DROP TABLE IF EXISTS atab
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE IF EXISTS atab
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: CREATE TABLE IF NOT EXISTS atab (ks_uid BIGINT, sr_uid STRING, sr_id STRING, tstamp STRING, m_id STRING, act STRING, at_sr_uid STRING, tstamp_type STRING, original_m_id STRING, original_tstamp STRING, registered_flag TINYINT, at_ks_uid BIGINT) PARTITIONED BY (dt STRING,nt STRING)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@atab
+POSTHOOK: query: CREATE TABLE IF NOT EXISTS atab (ks_uid BIGINT, sr_uid STRING, sr_id STRING, tstamp STRING, m_id STRING, act STRING, at_sr_uid STRING, tstamp_type STRING, original_m_id STRING, original_tstamp STRING, registered_flag TINYINT, at_ks_uid BIGINT) PARTITIONED BY (dt STRING,nt STRING)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@atab
+PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/v1.txt' INTO TABLE atab PARTITION (dt='20130311', nt='tw')
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@atab
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/v1.txt' INTO TABLE atab PARTITION (dt='20130311', nt='tw')
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@atab
+POSTHOOK: Output: default@atab@dt=20130311/nt=tw
+PREHOOK: query: DROP TABLE IF EXISTS mstab
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE IF EXISTS mstab
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: CREATE TABLE mstab(ks_uid INT, csc INT) PARTITIONED BY (dt STRING)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@mstab
+POSTHOOK: query: CREATE TABLE mstab(ks_uid INT, csc INT) PARTITIONED BY (dt STRING)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@mstab
+PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/v2.txt' INTO TABLE mstab PARTITION (dt='20130311')
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@mstab
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/v2.txt' INTO TABLE mstab PARTITION (dt='20130311')
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@mstab
+POSTHOOK: Output: default@mstab@dt=20130311
+PREHOOK: query: EXPLAIN
+WITH a AS ( SELECT * FROM atab WHERE dt='20130311' AND nt='tw' AND ks_uid=1111 LIMIT 2 ),
+b AS ( SELECT * FROM mstab WHERE mstab.dt='20130311' )
+SELECT * FROM a JOIN b
+ON a.ks_uid = b.ks_uid
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN
+WITH a AS ( SELECT * FROM atab WHERE dt='20130311' AND nt='tw' AND ks_uid=1111 LIMIT 2 ),
+b AS ( SELECT * FROM mstab WHERE mstab.dt='20130311' )
+SELECT * FROM a JOIN b
+ON a.ks_uid = b.ks_uid
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-2 depends on stages: Stage-1
+  Stage-0 depends on stages: Stage-2
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: atab
+            Statistics: Num rows: 1 Data size: 384 Basic stats: PARTIAL Column stats: NONE
+            Filter Operator
+              predicate: (ks_uid = 1111) (type: boolean)
+              Statistics: Num rows: 1 Data size: 384 Basic stats: COMPLETE Column stats: NONE
+              Select Operator
+                expressions: sr_uid (type: string), registered_flag (type: tinyint), at_ks_uid (type: bigint), sr_id (type: string), tstamp (type: string), m_id (type: string), act (type: string), at_sr_uid (type: string), tstamp_type (type: string), original_m_id (type: string), original_tstamp (type: string)
+                outputColumnNames: _col1, _col10, _col11, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9
+                Statistics: Num rows: 1 Data size: 384 Basic stats: COMPLETE Column stats: NONE
+                Limit
+                  Number of rows: 2
+                  Statistics: Num rows: 1 Data size: 384 Basic stats: COMPLETE Column stats: NONE
+                  Reduce Output Operator
+                    sort order: 
+                    Statistics: Num rows: 1 Data size: 384 Basic stats: COMPLETE Column stats: NONE
+                    TopN Hash Memory Usage: 0.1
+                    value expressions: _col1 (type: string), _col2 (type: string), _col3 (type: string), _col4 (type: string), _col5 (type: string), _col6 (type: string), _col7 (type: string), _col8 (type: string), _col9 (type: string), _col10 (type: tinyint), _col11 (type: bigint)
+      Reduce Operator Tree:
+        Select Operator
+          expressions: VALUE._col1 (type: string), VALUE._col10 (type: tinyint), VALUE._col11 (type: bigint), VALUE._col2 (type: string), VALUE._col3 (type: string), VALUE._col4 (type: string), VALUE._col5 (type: string), VALUE._col6 (type: string), VALUE._col7 (type: string), VALUE._col8 (type: string), VALUE._col9 (type: string)
+          outputColumnNames: _col1, _col10, _col11, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9
+          Statistics: Num rows: 1 Data size: 384 Basic stats: COMPLETE Column stats: NONE
+          Limit
+            Number of rows: 2
+            Statistics: Num rows: 1 Data size: 384 Basic stats: COMPLETE Column stats: NONE
+            File Output Operator
+              compressed: false
+              table:
+                  input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                  serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
+
+  Stage: Stage-2
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            Reduce Output Operator
+              key expressions: 1111 (type: bigint)
+              sort order: +
+              Map-reduce partition columns: 1111 (type: bigint)
+              Statistics: Num rows: 1 Data size: 384 Basic stats: COMPLETE Column stats: NONE
+              value expressions: _col1 (type: string), _col2 (type: string), _col3 (type: string), _col4 (type: string), _col5 (type: string), _col6 (type: string), _col7 (type: string), _col8 (type: string), _col9 (type: string), _col10 (type: tinyint), _col11 (type: bigint)
+          TableScan
+            alias: mstab
+            Statistics: Num rows: 12 Data size: 99 Basic stats: COMPLETE Column stats: NONE
+            Filter Operator
+              predicate: UDFToLong(ks_uid) is not null (type: boolean)
+              Statistics: Num rows: 12 Data size: 99 Basic stats: COMPLETE Column stats: NONE
+              Select Operator
+                expressions: ks_uid (type: int), csc (type: int)
+                outputColumnNames: _col0, _col1
+                Statistics: Num rows: 12 Data size: 99 Basic stats: COMPLETE Column stats: NONE
+                Reduce Output Operator
+                  key expressions: UDFToLong(_col0) (type: bigint)
+                  sort order: +
+                  Map-reduce partition columns: UDFToLong(_col0) (type: bigint)
+                  Statistics: Num rows: 12 Data size: 99 Basic stats: COMPLETE Column stats: NONE
+                  value expressions: _col0 (type: int), _col1 (type: int)
+      Reduce Operator Tree:
+        Join Operator
+          condition map:
+               Inner Join 0 to 1
+          keys:
+            0 _col0 (type: bigint)
+            1 UDFToLong(_col0) (type: bigint)
+          outputColumnNames: _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col14, _col15
+          Statistics: Num rows: 13 Data size: 108 Basic stats: COMPLETE Column stats: NONE
+          Select Operator
+            expressions: 1111 (type: bigint), _col1 (type: string), _col2 (type: string), _col3 (type: string), _col4 (type: string), _col5 (type: string), _col6 (type: string), _col7 (type: string), _col8 (type: string), _col9 (type: string), _col10 (type: tinyint), _col11 (type: bigint), '20130311' (type: string), 'tw' (type: string), _col14 (type: int), _col15 (type: int), '20130311' (type: string)
+            outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, _col16
+            Statistics: Num rows: 13 Data size: 108 Basic stats: COMPLETE Column stats: NONE
+            File Output Operator
+              compressed: false
+              Statistics: Num rows: 13 Data size: 108 Basic stats: COMPLETE Column stats: NONE
+              table:
+                  input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                  serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: WITH a AS ( SELECT * FROM atab WHERE dt='20130311' AND nt='tw' AND ks_uid=1111 LIMIT 2 ),
+b AS ( SELECT * FROM mstab WHERE mstab.dt='20130311' )
+SELECT * FROM a JOIN b
+ON a.ks_uid = b.ks_uid
+PREHOOK: type: QUERY
+PREHOOK: Input: default@atab
+PREHOOK: Input: default@atab@dt=20130311/nt=tw
+PREHOOK: Input: default@mstab
+PREHOOK: Input: default@mstab@dt=20130311
+#### A masked pattern was here ####
+POSTHOOK: query: WITH a AS ( SELECT * FROM atab WHERE dt='20130311' AND nt='tw' AND ks_uid=1111 LIMIT 2 ),
+b AS ( SELECT * FROM mstab WHERE mstab.dt='20130311' )
+SELECT * FROM a JOIN b
+ON a.ks_uid = b.ks_uid
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@atab
+POSTHOOK: Input: default@atab@dt=20130311/nt=tw
+POSTHOOK: Input: default@mstab
+POSTHOOK: Input: default@mstab@dt=20130311
+#### A masked pattern was here ####
+1111 foo abc 2013-10-10 12:12:12 xyz fun bar 2013-10-10 12:12:12 lmn 2013-11-11 12:12:12 9 2222 20130311 tw 1111 99999 20130311