Index: conf/hive-default.xml.template
===================================================================
--- conf/hive-default.xml.template (revision 1440586)
+++ conf/hive-default.xml.template (working copy)
@@ -1138,6 +1138,14 @@
+<property>
+  <name>hive.stats.collect.scancols</name>
+  <value>false</value>
+  <description>Whether column accesses are tracked in the QueryPlan.
+  This is useful to identify how tables are accessed and to determine if there are wasted columns that can be trimmed.
+  </description>
+</property>
+
 <property>
   <name>hive.stats.ndv.error</name>
   <value>20.0</value>
   <description>Standard error expressed in percentage. Provides a tradeoff between accuracy and compute cost. A lower value for error indicates higher accuracy and a higher compute cost.</description>
Index: common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
===================================================================
--- common/src/java/org/apache/hadoop/hive/conf/HiveConf.java (revision 1440586)
+++ common/src/java/org/apache/hadoop/hive/conf/HiveConf.java (working copy)
@@ -566,6 +566,8 @@
HIVE_STATS_RELIABLE("hive.stats.reliable", false),
// Collect table access keys information for operators that can benefit from bucketing
HIVE_STATS_COLLECT_TABLEKEYS("hive.stats.collect.tablekeys", false),
+ // Collect column access information
+ HIVE_STATS_COLLECT_SCANCOLS("hive.stats.collect.scancols", false),
// standard error allowed for ndv estimates. A lower value indicates higher accuracy and a
// higher compute cost.
HIVE_STATS_NDV_ERROR("hive.stats.ndv.error", (float)20.0),
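
The new ConfVars entry is read and written through HiveConf's boolean accessors. A minimal sketch of toggling the flag programmatically (hypothetical ScanColsFlagSketch class name; assumes this patch is applied so HIVE_STATS_COLLECT_SCANCOLS exists):

import org.apache.hadoop.hive.conf.HiveConf;

public class ScanColsFlagSketch {
  public static void main(String[] args) {
    HiveConf conf = new HiveConf();
    // The template default above is false; enable tracking on this conf.
    conf.setBoolVar(HiveConf.ConfVars.HIVE_STATS_COLLECT_SCANCOLS, true);
    if (conf.getBoolVar(HiveConf.ConfVars.HIVE_STATS_COLLECT_SCANCOLS)) {
      System.out.println("column access tracking enabled");
    }
  }
}
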
Index: ql/src/test/results/clientpositive/column_access_stats.q.out
===================================================================
--- ql/src/test/results/clientpositive/column_access_stats.q.out (revision 0)
+++ ql/src/test/results/clientpositive/column_access_stats.q.out (working copy)
@@ -0,0 +1,1077 @@
+PREHOOK: query: -- This test is used for testing the ColumnAccessAnalyzer
+
+CREATE TABLE T1(key STRING, val STRING) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t1
+PREHOOK: query: CREATE TABLE T2(key STRING, val STRING) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+PREHOOK: query: CREATE TABLE T3(key STRING, val STRING) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+PREHOOK: query: -- Simple select queries
+SELECT key FROM T1 ORDER BY key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+#### A masked pattern was here ####
+Table:t1
+Columns:key
+
+1
+2
+3
+7
+8
+8
+PREHOOK: query: SELECT key, val FROM T1 ORDER BY key, val
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+#### A masked pattern was here ####
+Table:t1
+Columns:key,val
+
+1 11
+2 12
+3 13
+7 17
+8 18
+8 28
+PREHOOK: query: SELECT 1 FROM T1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+#### A masked pattern was here ####
+1
+1
+1
+1
+1
+1
+PREHOOK: query: -- More complicated select queries
+EXPLAIN SELECT key FROM (SELECT key, val FROM T1) subq1 ORDER BY key
+PREHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL val))))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key))) (TOK_ORDERBY (TOK_TABSORTCOLNAMEASC (TOK_TABLE_OR_COL key)))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subq1:t1
+ TableScan
+ alias: t1
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ outputColumnNames: _col0
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ sort order: +
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: string
+ Reduce Operator Tree:
+ Extract
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT key FROM (SELECT key, val FROM T1) subq1 ORDER BY key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+#### A masked pattern was here ####
+Table:t1
+Columns:key
+
+1
+2
+3
+7
+8
+8
+PREHOOK: query: EXPLAIN SELECT k FROM (SELECT key as k, val as v FROM T1) subq1 ORDER BY k
+PREHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key) k) (TOK_SELEXPR (TOK_TABLE_OR_COL val) v)))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL k))) (TOK_ORDERBY (TOK_TABSORTCOLNAMEASC (TOK_TABLE_OR_COL k)))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subq1:t1
+ TableScan
+ alias: t1
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ outputColumnNames: _col0
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ sort order: +
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: string
+ Reduce Operator Tree:
+ Extract
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT k FROM (SELECT key as k, val as v FROM T1) subq1 ORDER BY k
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+#### A masked pattern was here ####
+Table:t1
+Columns:key
+
+1
+2
+3
+7
+8
+8
+PREHOOK: query: SELECT key + 1 as k FROM T1 ORDER BY k
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+#### A masked pattern was here ####
+Table:t1
+Columns:key
+
+2.0
+3.0
+4.0
+8.0
+9.0
+9.0
+PREHOOK: query: SELECT key + val as k FROM T1 ORDER BY k
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+#### A masked pattern was here ####
+Table:t1
+Columns:key,val
+
+12.0
+14.0
+16.0
+24.0
+26.0
+36.0
+PREHOOK: query: -- Work with union
+EXPLAIN
+SELECT * FROM (
+SELECT key as c FROM T1
+ UNION ALL
+SELECT val as c FROM T1
+) subq1 ORDER BY c
+PREHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key) c)))) (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL val) c))))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)) (TOK_ORDERBY (TOK_TABSORTCOLNAMEASC (TOK_TABLE_OR_COL c)))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ null-subquery1:subq1-subquery1:t1
+ TableScan
+ alias: t1
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ outputColumnNames: _col0
+ Union
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ outputColumnNames: _col0
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ sort order: +
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: string
+ null-subquery2:subq1-subquery2:t1
+ TableScan
+ alias: t1
+ Select Operator
+ expressions:
+ expr: val
+ type: string
+ outputColumnNames: _col0
+ Union
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ outputColumnNames: _col0
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ sort order: +
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: string
+ Reduce Operator Tree:
+ Extract
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT * FROM (
+SELECT key as c FROM T1
+ UNION ALL
+SELECT val as c FROM T1
+) subq1 ORDER BY c
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+#### A masked pattern was here ####
+Table:t1
+Columns:key,val
+
+1
+11
+12
+13
+17
+18
+2
+28
+3
+7
+8
+8
+PREHOOK: query: EXPLAIN
+SELECT * FROM (
+SELECT key as c FROM T1
+ UNION ALL
+SELECT key as c FROM T1
+) subq1 ORDER BY c
+PREHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key) c)))) (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key) c))))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)) (TOK_ORDERBY (TOK_TABSORTCOLNAMEASC (TOK_TABLE_OR_COL c)))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ null-subquery1:subq1-subquery1:t1
+ TableScan
+ alias: t1
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ outputColumnNames: _col0
+ Union
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ outputColumnNames: _col0
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ sort order: +
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: string
+ null-subquery2:subq1-subquery2:t1
+ TableScan
+ alias: t1
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ outputColumnNames: _col0
+ Union
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ outputColumnNames: _col0
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ sort order: +
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: string
+ Reduce Operator Tree:
+ Extract
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT * FROM (
+SELECT key as c FROM T1
+ UNION ALL
+SELECT key as c FROM T1
+) subq1 ORDER BY c
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+#### A masked pattern was here ####
+Table:t1
+Columns:key
+
+1
+1
+2
+2
+3
+3
+7
+7
+8
+8
+8
+8
+PREHOOK: query: -- Work with insert overwrite
+FROM T1
+INSERT OVERWRITE TABLE T2 SELECT key, count(1) GROUP BY key
+INSERT OVERWRITE TABLE T3 SELECT key, sum(val) GROUP BY key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Output: default@t2
+PREHOOK: Output: default@t3
+Table:t1
+Columns:key,val
+
+PREHOOK: query: -- Simple joins
+SELECT *
+FROM T1 JOIN T2
+ON T1.key = T2.key
+ORDER BY T1.key, T1.val, T2.key, T2.val
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Input: default@t2
+#### A masked pattern was here ####
+Table:t2
+Columns:key,val
+
+Table:t1
+Columns:key,val
+
+1 11 1 1
+2 12 2 1
+3 13 3 1
+7 17 7 1
+8 18 8 2
+8 28 8 2
+PREHOOK: query: EXPLAIN
+SELECT T1.key
+FROM T1 JOIN T2
+ON T1.key = T2.key
+ORDER BY T1.key
+PREHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME T1)) (TOK_TABREF (TOK_TABNAME T2)) (= (. (TOK_TABLE_OR_COL T1) key) (. (TOK_TABLE_OR_COL T2) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL T1) key))) (TOK_ORDERBY (TOK_TABSORTCOLNAMEASC (. (TOK_TABLE_OR_COL T1) key)))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ t1
+ TableScan
+ alias: t1
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 0
+ value expressions:
+ expr: key
+ type: string
+ t2
+ TableScan
+ alias: t2
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 1
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {VALUE._col0}
+ 1
+ handleSkewJoin: false
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ sort order: +
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: string
+ Reduce Operator Tree:
+ Extract
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT T1.key
+FROM T1 JOIN T2
+ON T1.key = T2.key
+ORDER BY T1.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Input: default@t2
+#### A masked pattern was here ####
+Table:t2
+Columns:key
+
+Table:t1
+Columns:key
+
+1
+2
+3
+7
+8
+8
+PREHOOK: query: SELECT *
+FROM T1 JOIN T2
+ON T1.key = T2.key AND T1.val = T2.val
+ORDER BY T1.key, T1.val
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Input: default@t2
+#### A masked pattern was here ####
+Table:t2
+Columns:key,val
+
+Table:t1
+Columns:key,val
+
+PREHOOK: query: -- Map join
+SELECT /*+ MAPJOIN(a) */ *
+FROM T1 a JOIN T2 b
+ON a.key = b.key
+ORDER BY a.key, a.val, b.key, b.val
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Input: default@t2
+#### A masked pattern was here ####
+Table:t2
+Columns:key,val
+
+Table:t1
+Columns:key,val
+
+1 11 1 1
+2 12 2 1
+3 13 3 1
+7 17 7 1
+8 18 8 2
+8 28 8 2
+PREHOOK: query: -- More joins
+EXPLAIN
+SELECT *
+FROM T1 JOIN T2
+ON T1.key = T2.key AND T1.val = 3 and T2.val = 3
+ORDER BY T1.key, T1.val
+PREHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME T1)) (TOK_TABREF (TOK_TABNAME T2)) (and (AND (= (. (TOK_TABLE_OR_COL T1) key) (. (TOK_TABLE_OR_COL T2) key)) (= (. (TOK_TABLE_OR_COL T1) val) 3)) (= (. (TOK_TABLE_OR_COL T2) val) 3)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)) (TOK_ORDERBY (TOK_TABSORTCOLNAMEASC (. (TOK_TABLE_OR_COL T1) key)) (TOK_TABSORTCOLNAMEASC (. (TOK_TABLE_OR_COL T1) val)))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ t1
+ TableScan
+ alias: t1
+ Filter Operator
+ predicate:
+ expr: (val = 3.0)
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 0
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ t2
+ TableScan
+ alias: t2
+ Filter Operator
+ predicate:
+ expr: (val = 3.0)
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 1
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {VALUE._col0} {VALUE._col1}
+ 1 {VALUE._col0} {VALUE._col1}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col1, _col4, _col5
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col4
+ type: string
+ expr: _col5
+ type: string
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ sort order: ++
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: string
+ expr: _col3
+ type: string
+ Reduce Operator Tree:
+ Extract
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT *
+FROM T1 JOIN T2
+ON T1.key = T2.key AND T1.val = 3 and T2.val = 3
+ORDER BY T1.key, T1.val
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Input: default@t2
+#### A masked pattern was here ####
+Table:t2
+Columns:key,val
+
+Table:t1
+Columns:key,val
+
+PREHOOK: query: EXPLAIN
+SELECT subq1.val
+FROM
+(
+ SELECT val FROM T1 WHERE key = 5
+) subq1
+JOIN
+(
+ SELECT val FROM T2 WHERE key = 6
+) subq2
+ON subq1.val = subq2.val
+ORDER BY subq1.val
+PREHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL val))) (TOK_WHERE (= (TOK_TABLE_OR_COL key) 5)))) subq1) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T2))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL val))) (TOK_WHERE (= (TOK_TABLE_OR_COL key) 6)))) subq2) (= (. (TOK_TABLE_OR_COL subq1) val) (. (TOK_TABLE_OR_COL subq2) val)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL subq1) val))) (TOK_ORDERBY (TOK_TABSORTCOLNAMEASC (. (TOK_TABLE_OR_COL subq1) val)))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subq1:t1
+ TableScan
+ alias: t1
+ Filter Operator
+ predicate:
+ expr: (key = 5.0)
+ type: boolean
+ Select Operator
+ expressions:
+ expr: val
+ type: string
+ outputColumnNames: _col0
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ tag: 0
+ value expressions:
+ expr: _col0
+ type: string
+ subq2:t2
+ TableScan
+ alias: t2
+ Filter Operator
+ predicate:
+ expr: (key = 6.0)
+ type: boolean
+ Select Operator
+ expressions:
+ expr: val
+ type: string
+ outputColumnNames: _col0
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ tag: 1
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {VALUE._col0}
+ 1
+ handleSkewJoin: false
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ sort order: +
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: string
+ Reduce Operator Tree:
+ Extract
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT subq1.val
+FROM
+(
+ SELECT val FROM T1 WHERE key = 5
+) subq1
+JOIN
+(
+ SELECT val FROM T2 WHERE key = 6
+) subq2
+ON subq1.val = subq2.val
+ORDER BY subq1.val
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Input: default@t2
+#### A masked pattern was here ####
+Table:t2
+Columns:key,val
+
+Table:t1
+Columns:key,val
+
+PREHOOK: query: -- Join followed by join
+EXPLAIN
+SELECT *
+FROM
+(
+ SELECT subq1.key as key
+ FROM
+ (
+ SELECT key, val FROM T1
+ ) subq1
+ JOIN
+ (
+ SELECT key, 'teststring' as val FROM T2
+ ) subq2
+ ON subq1.key = subq2.key
+) T4
+JOIN T3
+ON T3.key = T4.key
+ORDER BY T3.key, T4.key
+PREHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL val))))) subq1) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T2))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 'teststring' val)))) subq2) (= (. (TOK_TABLE_OR_COL subq1) key) (. (TOK_TABLE_OR_COL subq2) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL subq1) key) key)))) T4) (TOK_TABREF (TOK_TABNAME T3)) (= (. (TOK_TABLE_OR_COL T3) key) (. (TOK_TABLE_OR_COL T4) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)) (TOK_ORDERBY (TOK_TABSORTCOLNAMEASC (. (TOK_TABLE_OR_COL T3) key)) (TOK_TABSORTCOLNAMEASC (. (TOK_TABLE_OR_COL T4) key)))))
+
+STAGE DEPENDENCIES:
+ Stage-3 is a root stage
+ Stage-1 depends on stages: Stage-3
+ Stage-2 depends on stages: Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-3
+ Map Reduce
+ Alias -> Map Operator Tree:
+ t4:subq1:t1
+ TableScan
+ alias: t1
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ outputColumnNames: _col0
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ tag: 0
+ value expressions:
+ expr: _col0
+ type: string
+ t4:subq2:t2
+ TableScan
+ alias: t2
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ outputColumnNames: _col0
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ tag: 1
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {VALUE._col0}
+ 1
+ handleSkewJoin: false
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ $INTNAME
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ tag: 0
+ value expressions:
+ expr: _col0
+ type: string
+ t3
+ TableScan
+ alias: t3
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 1
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {VALUE._col0}
+ 1 {VALUE._col0} {VALUE._col1}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col1, _col2
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: string
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ Reduce Output Operator
+ key expressions:
+ expr: _col1
+ type: string
+ expr: _col0
+ type: string
+ sort order: ++
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: string
+ Reduce Operator Tree:
+ Extract
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT *
+FROM
+(
+ SELECT subq1.key as key
+ FROM
+ (
+ SELECT key, val FROM T1
+ ) subq1
+ JOIN
+ (
+ SELECT key, 'teststring' as val FROM T2
+ ) subq2
+ ON subq1.key = subq2.key
+) T4
+JOIN T3
+ON T3.key = T4.key
+ORDER BY T3.key, T4.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Input: default@t2
+PREHOOK: Input: default@t3
+#### A masked pattern was here ####
+Table:t3
+Columns:key,val
+
+Table:t2
+Columns:key
+
+Table:t1
+Columns:key
+
+1 1 11.0
+2 2 12.0
+3 3 13.0
+7 7 17.0
+8 8 46.0
+8 8 46.0
Index: ql/src/test/org/apache/hadoop/hive/ql/hooks/CheckColumnAccessHook.java
===================================================================
--- ql/src/test/org/apache/hadoop/hive/ql/hooks/CheckColumnAccessHook.java (revision 0)
+++ ql/src/test/org/apache/hadoop/hive/ql/hooks/CheckColumnAccessHook.java (working copy)
@@ -0,0 +1,84 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.hooks;
+
+import java.util.Arrays;
+import java.util.Map;
+import java.util.HashMap;
+import java.util.Set;
+
+import org.apache.commons.lang.StringUtils;
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.ql.QueryPlan;
+import org.apache.hadoop.hive.ql.session.SessionState;
+import org.apache.hadoop.hive.ql.session.SessionState.LogHelper;
+
+import org.apache.hadoop.hive.ql.parse.ColumnAccessInfo;
+import org.mortbay.log.Log;
+
+/*
+ * This hook is used for verifying the column access information
+ * that is generated and maintained in the QueryPlan object by the
+ * ColumnAccessAnalyzer. All the hook does is print out the columns
+ * accessed from each table as recorded in the ColumnAccessInfo
+ * in the QueryPlan.
+ */
+public class CheckColumnAccessHook implements ExecuteWithHookContext {
+
+ public void run(HookContext hookContext) {
+ Log.info("Running CheckColumnAccessHook");
+ HiveConf conf = hookContext.getConf();
+ if (conf.getBoolVar(HiveConf.ConfVars.HIVE_STATS_COLLECT_SCANCOLS) == false) {
+ return;
+ }
+
+ QueryPlan plan = hookContext.getQueryPlan();
+ if (plan == null) {
+ return;
+ }
+
+ ColumnAccessInfo columnAccessInfo = hookContext.getQueryPlan().getColumnAccessInfo();
+ if (columnAccessInfo == null) {
+ return;
+ }
+
+ LogHelper console = SessionState.getConsole();
+ Map<String, Set<String>> tableToColumnAccessMap =
+ columnAccessInfo.getTableToColumnAccessMap();
+
+ // We need a new map to ensure output is always produced in the same order.
+ // This makes tests that use this hook deterministic.
+ Map<String, String> outputOrderedMap = new HashMap<String, String>();
+
+ for (Map.Entry<String, Set<String>> tableAccess : tableToColumnAccessMap.entrySet()) {
+ StringBuilder perTableInfo = new StringBuilder();
+ perTableInfo.append("Table:").append(tableAccess.getKey()).append("\n");
+ // Sort columns to make output deterministic
+ String[] columns = new String[tableAccess.getValue().size()];
+ tableAccess.getValue().toArray(columns);
+ Arrays.sort(columns);
+ perTableInfo.append("Columns:").append(StringUtils.join(columns, ','))
+ .append("\n");
+ outputOrderedMap.put(tableAccess.getKey(), perTableInfo.toString());
+ }
+
+ for (String perTableInfo : outputOrderedMap.values()) {
+ console.printError(perTableInfo);
+ }
+ }
+}
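
A caveat on the ordering comment above: the JDK does not guarantee a stable iteration order for HashMap, so the determinism here comes from HashMap's de-facto behavior for these keys (the .q.out above records t2 before t1). A sorted map would make the order explicit; a minimal sketch (hypothetical SortedHookOutputSketch class, not part of the patch — adopting it would change the recorded golden-output order):

import java.util.Map;
import java.util.TreeMap;

public class SortedHookOutputSketch {
  public static void main(String[] args) {
    // TreeMap iterates in ascending key order, so the per-table blocks
    // always print sorted by table name.
    Map<String, String> outputOrderedMap = new TreeMap<String, String>();
    outputOrderedMap.put("t2", "Table:t2\nColumns:key,val\n");
    outputOrderedMap.put("t1", "Table:t1\nColumns:key\n");
    for (String perTableInfo : outputOrderedMap.values()) {
      System.out.print(perTableInfo); // t1's block first, then t2's
    }
  }
}
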
Index: ql/src/test/queries/clientpositive/column_access_stats.q
===================================================================
--- ql/src/test/queries/clientpositive/column_access_stats.q (revision 0)
+++ ql/src/test/queries/clientpositive/column_access_stats.q (working copy)
@@ -0,0 +1,158 @@
+SET hive.exec.post.hooks=org.apache.hadoop.hive.ql.hooks.CheckColumnAccessHook;
+SET hive.stats.collect.scancols=true;
+
+-- This test is used for testing the ColumnAccessAnalyzer
+
+CREATE TABLE T1(key STRING, val STRING) STORED AS TEXTFILE;
+LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1;
+
+CREATE TABLE T2(key STRING, val STRING) STORED AS TEXTFILE;
+CREATE TABLE T3(key STRING, val STRING) STORED AS TEXTFILE;
+
+-- Simple select queries
+SELECT key FROM T1 ORDER BY key;
+SELECT key, val FROM T1 ORDER BY key, val;
+SELECT 1 FROM T1;
+
+-- More complicated select queries
+EXPLAIN SELECT key FROM (SELECT key, val FROM T1) subq1 ORDER BY key;
+SELECT key FROM (SELECT key, val FROM T1) subq1 ORDER BY key;
+EXPLAIN SELECT k FROM (SELECT key as k, val as v FROM T1) subq1 ORDER BY k;
+SELECT k FROM (SELECT key as k, val as v FROM T1) subq1 ORDER BY k;
+SELECT key + 1 as k FROM T1 ORDER BY k;
+SELECT key + val as k FROM T1 ORDER BY k;
+
+-- Work with union
+EXPLAIN
+SELECT * FROM (
+SELECT key as c FROM T1
+ UNION ALL
+SELECT val as c FROM T1
+) subq1 ORDER BY c;
+
+SELECT * FROM (
+SELECT key as c FROM T1
+ UNION ALL
+SELECT val as c FROM T1
+) subq1 ORDER BY c;
+
+EXPLAIN
+SELECT * FROM (
+SELECT key as c FROM T1
+ UNION ALL
+SELECT key as c FROM T1
+) subq1 ORDER BY c;
+
+SELECT * FROM (
+SELECT key as c FROM T1
+ UNION ALL
+SELECT key as c FROM T1
+) subq1 ORDER BY c;
+
+-- Work with insert overwrite
+FROM T1
+INSERT OVERWRITE TABLE T2 SELECT key, count(1) GROUP BY key
+INSERT OVERWRITE TABLE T3 SELECT key, sum(val) GROUP BY key;
+
+-- Simple joins
+SELECT *
+FROM T1 JOIN T2
+ON T1.key = T2.key
+ORDER BY T1.key, T1.val, T2.key, T2.val;
+
+EXPLAIN
+SELECT T1.key
+FROM T1 JOIN T2
+ON T1.key = T2.key
+ORDER BY T1.key;
+
+SELECT T1.key
+FROM T1 JOIN T2
+ON T1.key = T2.key
+ORDER BY T1.key;
+
+SELECT *
+FROM T1 JOIN T2
+ON T1.key = T2.key AND T1.val = T2.val
+ORDER BY T1.key, T1.val;
+
+-- Map join
+SELECT /*+ MAPJOIN(a) */ *
+FROM T1 a JOIN T2 b
+ON a.key = b.key
+ORDER BY a.key, a.val, b.key, b.val;
+
+-- More joins
+EXPLAIN
+SELECT *
+FROM T1 JOIN T2
+ON T1.key = T2.key AND T1.val = 3 and T2.val = 3
+ORDER BY T1.key, T1.val;
+
+SELECT *
+FROM T1 JOIN T2
+ON T1.key = T2.key AND T1.val = 3 and T2.val = 3
+ORDER BY T1.key, T1.val;
+
+EXPLAIN
+SELECT subq1.val
+FROM
+(
+ SELECT val FROM T1 WHERE key = 5
+) subq1
+JOIN
+(
+ SELECT val FROM T2 WHERE key = 6
+) subq2
+ON subq1.val = subq2.val
+ORDER BY subq1.val;
+
+SELECT subq1.val
+FROM
+(
+ SELECT val FROM T1 WHERE key = 5
+) subq1
+JOIN
+(
+ SELECT val FROM T2 WHERE key = 6
+) subq2
+ON subq1.val = subq2.val
+ORDER BY subq1.val;
+
+-- Join followed by join
+EXPLAIN
+SELECT *
+FROM
+(
+ SELECT subq1.key as key
+ FROM
+ (
+ SELECT key, val FROM T1
+ ) subq1
+ JOIN
+ (
+ SELECT key, 'teststring' as val FROM T2
+ ) subq2
+ ON subq1.key = subq2.key
+) T4
+JOIN T3
+ON T3.key = T4.key
+ORDER BY T3.key, T4.key;
+
+SELECT *
+FROM
+(
+ SELECT subq1.key as key
+ FROM
+ (
+ SELECT key, val FROM T1
+ ) subq1
+ JOIN
+ (
+ SELECT key, 'teststring' as val FROM T2
+ ) subq2
+ ON subq1.key = subq2.key
+) T4
+JOIN T3
+ON T3.key = T4.key
+ORDER BY T3.key, T4.key;
Index: ql/src/java/org/apache/hadoop/hive/ql/QueryPlan.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/QueryPlan.java (revision 1440586)
+++ ql/src/java/org/apache/hadoop/hive/ql/QueryPlan.java (working copy)
@@ -44,6 +44,7 @@
import org.apache.hadoop.hive.ql.hooks.ReadEntity;
import org.apache.hadoop.hive.ql.hooks.WriteEntity;
import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer;
+import org.apache.hadoop.hive.ql.parse.ColumnAccessInfo;
import org.apache.hadoop.hive.ql.parse.TableAccessInfo;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
import org.apache.hadoop.hive.ql.plan.ReducerTimeStatsPerJob;
@@ -84,6 +85,7 @@
*/
protected LineageInfo linfo;
private TableAccessInfo tableAccessInfo;
+ private ColumnAccessInfo columnAccessInfo;
private HashMap<String, String> idToTableNameMap;
@@ -113,6 +115,7 @@
outputs = sem.getOutputs();
linfo = sem.getLineageInfo();
tableAccessInfo = sem.getTableAccessInfo();
+ columnAccessInfo = sem.getColumnAccessInfo();
idToTableNameMap = new HashMap<String, String>(sem.getIdToTableNameMap());
queryId = makeQueryId();
@@ -777,6 +780,25 @@
this.tableAccessInfo = tableAccessInfo;
}
+ /**
+ * Gets the column access information.
+ *
+ * @return ColumnAccessInfo associated with the query.
+ */
+ public ColumnAccessInfo getColumnAccessInfo() {
+ return columnAccessInfo;
+ }
+
+ /**
+ * Sets the column access information.
+ *
+ * @param columnAccessInfo The ColumnAccessInfo structure that is set immediately after
+ * the optimization phase.
+ */
+ public void setColumnAccessInfo(ColumnAccessInfo columnAccessInfo) {
+ this.columnAccessInfo = columnAccessInfo;
+ }
+
public QueryProperties getQueryProperties() {
return queryProperties;
}
Index: ql/src/java/org/apache/hadoop/hive/ql/parse/ColumnAccessInfo.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/parse/ColumnAccessInfo.java (revision 0)
+++ ql/src/java/org/apache/hadoop/hive/ql/parse/ColumnAccessInfo.java (working copy)
@@ -0,0 +1,48 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.parse;
+
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+public class ColumnAccessInfo {
+ /**
+ * Map of table name to names of accessed columns
+ */
+ private final Map<String, Set<String>> tableToColumnAccessMap;
+
+ public ColumnAccessInfo() {
+ tableToColumnAccessMap = new HashMap<String, Set<String>>();
+ }
+
+ public void add(String table, String col) {
+ Set<String> tableColumns = tableToColumnAccessMap.get(table);
+ if (tableColumns == null) {
+ tableColumns = new HashSet<String>();
+ tableToColumnAccessMap.put(table, tableColumns);
+ }
+ tableColumns.add(col);
+ }
+
+ public Map<String, Set<String>> getTableToColumnAccessMap() {
+ return tableToColumnAccessMap;
+ }
+}
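
ColumnAccessInfo is a thin wrapper over a map of table name to a set of accessed column names, so repeated adds of the same column are collapsed. A minimal usage sketch (hypothetical ColumnAccessInfoSketch class; assumes this patch is applied):

import java.util.Map;
import java.util.Set;

import org.apache.hadoop.hive.ql.parse.ColumnAccessInfo;

public class ColumnAccessInfoSketch {
  public static void main(String[] args) {
    ColumnAccessInfo info = new ColumnAccessInfo();
    info.add("t1", "key");
    info.add("t1", "val");
    info.add("t1", "key"); // duplicate access, collapsed by the HashSet
    info.add("t2", "key");
    Map<String, Set<String>> byTable = info.getTableToColumnAccessMap();
    System.out.println(byTable.get("t1").size()); // prints 2
    System.out.println(byTable.get("t2"));        // prints [key]
  }
}
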
Index: ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java (revision 1440586)
+++ ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java (working copy)
@@ -8473,6 +8473,13 @@
optm.initialize(conf);
pCtx = optm.optimize();
+ // Generate column access stats if required - wait until column pruning takes place
+ // during optimization
+ if (HiveConf.getBoolVar(this.conf, HiveConf.ConfVars.HIVE_STATS_COLLECT_SCANCOLS) == true) {
+ ColumnAccessAnalyzer columnAccessAnalyzer = new ColumnAccessAnalyzer(pCtx);
+ setColumnAccessInfo(columnAccessAnalyzer.analyzeColumnAccess());
+ }
+
// At this point we have the complete operator tree
// from which we want to find the reduce operator
genMapRedTasks(pCtx);
Index: ql/src/java/org/apache/hadoop/hive/ql/parse/BaseSemanticAnalyzer.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/parse/BaseSemanticAnalyzer.java (revision 1440586)
+++ ql/src/java/org/apache/hadoop/hive/ql/parse/BaseSemanticAnalyzer.java (working copy)
@@ -97,6 +97,7 @@
*/
protected LineageInfo linfo;
protected TableAccessInfo tableAccessInfo;
+ protected ColumnAccessInfo columnAccessInfo;
protected static final String TEXTFILE_INPUT = TextInputFormat.class
.getName();
@@ -830,6 +831,25 @@
this.tableAccessInfo = tableAccessInfo;
}
+ /**
+ * Gets the column access information.
+ *
+ * @return ColumnAccessInfo associated with the query.
+ */
+ public ColumnAccessInfo getColumnAccessInfo() {
+ return columnAccessInfo;
+ }
+
+ /**
+ * Sets the column access information.
+ *
+ * @param columnAccessInfo The ColumnAccessInfo structure that is set immediately after
+ * the optimization phase.
+ */
+ public void setColumnAccessInfo(ColumnAccessInfo columnAccessInfo) {
+ this.columnAccessInfo = columnAccessInfo;
+ }
+
protected HashMap<String, String> extractPartitionSpecs(Tree partspec)
throws SemanticException {
HashMap<String, String> partSpec = new LinkedHashMap<String, String>();
Index: ql/src/java/org/apache/hadoop/hive/ql/parse/ColumnAccessAnalyzer.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/parse/ColumnAccessAnalyzer.java (revision 0)
+++ ql/src/java/org/apache/hadoop/hive/ql/parse/ColumnAccessAnalyzer.java (working copy)
@@ -0,0 +1,54 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.parse;
+
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hive.metastore.api.FieldSchema;
+import org.apache.hadoop.hive.ql.exec.TableScanOperator;
+import org.apache.hadoop.hive.ql.metadata.Table;
+
+public class ColumnAccessAnalyzer {
+ private static final Log LOG = LogFactory.getLog(ColumnAccessAnalyzer.class.getName());
+ private final ParseContext pGraphContext;
+
+ public ColumnAccessAnalyzer() {
+ pGraphContext = null;
+ }
+
+ public ColumnAccessAnalyzer(ParseContext pactx) {
+ pGraphContext = pactx;
+ }
+
+ public ColumnAccessInfo analyzeColumnAccess() throws SemanticException {
+ ColumnAccessInfo columnAccessInfo = new ColumnAccessInfo();
+ Map<TableScanOperator, Table> topOps = pGraphContext.getTopToTable();
+ for (TableScanOperator op : topOps.keySet()) {
+ Table table = topOps.get(op);
+ String tableName = table.getTableName();
+ List<FieldSchema> tableCols = table.getAllCols();
+ for (int i : op.getNeededColumnIDs()) {
+ columnAccessInfo.add(tableName, tableCols.get(i).getName());
+ }
+ }
+ return columnAccessInfo;
+ }
+}
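
To make the index arithmetic in analyzeColumnAccess concrete: getNeededColumnIDs() returns positions into the table's full column list as left by column pruning, and each position is resolved to a column name via getAllCols(). A minimal self-contained sketch (hypothetical NeededColumnsSketch class; plain strings stand in for FieldSchema objects):

import java.util.Arrays;
import java.util.List;

public class NeededColumnsSketch {
  public static void main(String[] args) {
    // Stand-ins: T1 has columns (key, val); pruning kept only position 0,
    // as in the first test query, SELECT key FROM T1.
    List<String> allCols = Arrays.asList("key", "val");
    List<Integer> neededColumnIDs = Arrays.asList(0);
    for (int i : neededColumnIDs) {
      // Mirrors columnAccessInfo.add(tableName, tableCols.get(i).getName())
      System.out.println("t1 accessed column: " + allCols.get(i));
    }
  }
}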