diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/ColumnStatsSemanticAnalyzer.java ql/src/java/org/apache/hadoop/hive/ql/parse/ColumnStatsSemanticAnalyzer.java
index 5b77e6f..32ad4b2 100644
--- ql/src/java/org/apache/hadoop/hive/ql/parse/ColumnStatsSemanticAnalyzer.java
+++ ql/src/java/org/apache/hadoop/hive/ql/parse/ColumnStatsSemanticAnalyzer.java
@@ -54,16 +54,15 @@ private boolean isRewritten;
   private boolean isTableLevel;
-  private String tableName;
   private List<String> colNames;
   private List<String> colType;
   private String partName;
+  private Table tbl;
 
   private class PartitionList {
     private final String[] partKeys;
-    private String[] partKeyTypes;
     private final String[] partValues;
-    private int numPartitions;
+    private final int numPartitions;
     private int numPartitionValues;
 
     PartitionList(int numPartitions) {
@@ -76,10 +75,6 @@ public int getNumPartitions() {
       return numPartitions;
     }
 
-    public void setNumPartitions(int numPartitions) {
-      this.numPartitions = numPartitions;
-    }
-
     public String[] getPartValues() {
       return partValues;
     }
@@ -103,18 +98,6 @@ public int getNumPartValues() {
     public void setNumPartValues(int numPartValues) {
       numPartitionValues = numPartValues;
     }
-
-    public String[] getPartKeyTypes() {
-      return partKeyTypes;
-    }
-
-    public void setPartKeyTypes(String[] partKeyTypes) {
-      this.partKeyTypes = partKeyTypes;
-    }
-
-    public void setPartKeyType(String partKeyType, int index) {
-      partKeyTypes[index] = partKeyType;
-    }
   }
 
   public ColumnStatsSemanticAnalyzer(HiveConf conf) throws SemanticException {
@@ -130,7 +113,7 @@ private boolean shouldRewrite(ASTNode tree) {
         child0 = (ASTNode) child0.getChild(0);
         if (child0.getToken().getType() == HiveParser.TOK_TABNAME) {
           child1 = (ASTNode) tree.getChild(1);
-          if (child1.getToken().getType() == HiveParser.TOK_TABCOLNAME) {
+          if (child1.getToken().getType() == HiveParser.KW_COLUMNS) {
             rwt = true;
           }
         }
@@ -151,8 +134,13 @@ private boolean isPartitionLevelStats(ASTNode tree) {
     return isPartitioned;
   }
 
-  private String getTableName(ASTNode tree) {
-    return getUnescapedName((ASTNode) tree.getChild(0).getChild(0));
+  private Table getTable(ASTNode tree) throws SemanticException {
+    String tableName = getUnescapedName((ASTNode) tree.getChild(0).getChild(0));
+    try {
+      return db.getTable(tableName);
+    } catch (HiveException e) {
+      throw new SemanticException(ErrorMsg.INVALID_TABLE.getMsg(tableName));
+    }
   }
 
   private PartitionList getPartKeyValuePairsFromAST(ASTNode tree) {
@@ -180,27 +168,25 @@ private PartitionList getPartKeyValuePairsFromAST(ASTNode tree) {
     return partList;
   }
 
-  private List<String> getColumnName(ASTNode tree) {
-    int numCols = tree.getChild(1).getChildCount();
-    List<String> colName = new LinkedList<String>();
-    for (int i = 0; i < numCols; i++) {
-      colName.add(i, new String(getUnescapedName((ASTNode) tree.getChild(1).getChild(i))));
-    }
-    return colName;
-  }
+  private List<String> getColumnName(ASTNode tree) throws SemanticException{
 
-  private int getNumColumns(ASTNode tree) {
-    return tree.getChild(1).getChildCount();
+    switch (tree.getChildCount()) {
+      case 2:
+        return Utilities.getColumnNamesFromFieldSchema(tbl.getCols());
+      case 3:
+        int numCols = tree.getChild(2).getChildCount();
+        List<String> colName = new LinkedList<String>();
+        for (int i = 0; i < numCols; i++) {
+          colName.add(i, new String(getUnescapedName((ASTNode) tree.getChild(2).getChild(i))));
+        }
+        return colName;
+      default:
+        throw new SemanticException("Internal error");
+    }
   }
 
-  private void validatePartitionKeys(String tableName, PartitionList partList) throws
+  private void validatePartitionKeys(PartitionList partList) throws
       SemanticException {
-    Table tbl;
-    try {
-      tbl = db.getTable(tableName);
-    } catch (HiveException e) {
-      throw new SemanticException(ErrorMsg.INVALID_TABLE.getMsg(tableName));
-    }
     List<FieldSchema> partKeys = tbl.getPartitionKeys();
 
     String[] inputPartKeys = partList.getPartKeys();
@@ -221,15 +207,8 @@ private void validatePartitionKeys(String tableName, PartitionList partList) thr
     }
   }
 
-  private String[] getPartitionKeysType(String tableName, PartitionList partList) throws
+  private String[] getPartitionKeysType(PartitionList partList) throws
       SemanticException {
-    Table tbl;
-    try {
-      tbl = db.getTable(tableName);
-    } catch (HiveException e) {
-      throw new SemanticException(ErrorMsg.INVALID_TABLE.getMsg(tableName));
-    }
-
     List<FieldSchema> partKeys = tbl.getPartitionKeys();
     String[] inputPartKeys = partList.getPartKeys();
     String[] inputPartKeyTypes = new String[inputPartKeys.length];
@@ -245,20 +224,13 @@ private void validatePartitionKeys(String tableName, PartitionList partList) thr
     return inputPartKeyTypes;
   }
 
-  private String constructPartitionName(String tableName, PartitionList partList)
+  private String constructPartitionName(PartitionList partList)
       throws SemanticException {
-    Table tbl;
     Partition part;
     String[] partKeys = partList.getPartKeys();
     String[] partValues = partList.getPartValues();
-
-    try {
-      tbl = db.getTable(tableName);
-    } catch (HiveException e) {
-      throw new SemanticException(ErrorMsg.INVALID_TABLE.getMsg(tableName));
-    }
-
     Map<String, String> partSpec = new LinkedHashMap<String, String>();
+
     for (int i=0; i
-  private List<String> getTableColumnType(String tableName, List<String> colNames, int numCols)
+  private List<String> getTableColumnType(List<String> colNames)
       throws SemanticException{
     List<String> colTypes = new LinkedList<String>();
-    String colName;
-    Table tbl;
-    try {
-      tbl = db.getTable(tableName);
-    } catch (HiveException e) {
-      throw new SemanticException(ErrorMsg.INVALID_TABLE.getMsg(tableName));
-    }
-
     List<FieldSchema> cols = tbl.getCols();
-    for (int i=0; i
-  private List<String> getPartitionColumnType(String tableName, String partName,
-      List<String> colNames, int numCols) throws SemanticException {
+  private List<String> getPartitionColumnType(String partName,
+      List<String> colNames) throws SemanticException {
     List<String> colTypes = new LinkedList<String>();
-    String colName;
-    Table tbl;
-    try {
-      tbl = db.getTable(tableName);
-    } catch (HiveException e) {
-      throw new SemanticException(ErrorMsg.INVALID_TABLE.getMsg(tableName));
-    }
-
     List<String> partNames = new ArrayList<String>();
     partNames.add(partName);
     List<Partition> partitionList;
@@ -443,11 +398,10 @@ private int getNumBitVectorsForNDVEstimation(HiveConf conf) throws SemanticExcep
     Partition part = partitionList.get(0);
     List<FieldSchema> cols = part.getCols();
 
-    for (int i=0; i
 colNames, int numBitVectors, Parti
       rewrittenQueryBuilder.append(" )");
     }
     rewrittenQueryBuilder.append(" from ");
-    rewrittenQueryBuilder.append(tableName);
+    rewrittenQueryBuilder.append(tbl.getTableName());
     isRewritten = true;
 
     // If partition level statistics is requested, add predicate and group by as needed to rewritten
@@ -514,25 +468,24 @@ public ColumnStatsSemanticAnalyzer(HiveConf conf, ASTNode tree) throws SemanticE
      * an aggregation.
      */
     if (shouldRewrite(tree)) {
-      tableName = new String(getTableName(tree));
+      tbl = getTable(tree);
       colNames = getColumnName(tree);
-      int numCols = getNumColumns(tree);
       // Save away the original AST
       originalTree = tree;
 
       boolean isPartitionStats = isPartitionLevelStats(tree);
       PartitionList partList = null;
-      checkForPartitionColumns(colNames, getPartitionKeys(tableName));
-      validateSpecifiedColumnNames(tableName, colNames);
+      checkForPartitionColumns(colNames, Utilities.getColumnNamesFromFieldSchema(tbl.getPartitionKeys()));
+      validateSpecifiedColumnNames(colNames);
 
       if (isPartitionStats) {
         isTableLevel = false;
         partList = getPartKeyValuePairsFromAST(tree);
-        validatePartitionClause(tableName, partList);
-        partName = constructPartitionName(tableName, partList);
-        colType = getPartitionColumnType(tableName, partName, colNames, numCols);
+        validatePartitionClause(partList);
+        partName = constructPartitionName(partList);
+        colType = getPartitionColumnType(partName, colNames);
       } else {
         isTableLevel = true;
-        colType = getTableColumnType(tableName, colNames, numCols);
+        colType = getTableColumnType(colNames);
       }
 
       int numBitVectors = getNumBitVectorsForNDVEstimation(conf);
@@ -547,16 +500,9 @@ public ColumnStatsSemanticAnalyzer(HiveConf conf, ASTNode tree) throws SemanticE
   }
 
   // fail early if the columns specified for column statistics are not valid
-  private void validateSpecifiedColumnNames(String tableName, List<String> specifiedCols)
+  private void validateSpecifiedColumnNames(List<String> specifiedCols)
       throws SemanticException {
-    List<FieldSchema> fields = null;
-    try {
-      fields = db.getTable(tableName).getAllCols();
-    } catch (HiveException e) {
-      throw new SemanticException(ErrorMsg.INVALID_TABLE.getMsg(tableName));
-    }
-    List<String> tableCols = Utilities.getColumnNamesFromFieldSchema(fields);
-
+    List<String> tableCols = Utilities.getColumnNamesFromFieldSchema(tbl.getCols());
     for(String sc : specifiedCols) {
       if (!tableCols.contains(sc.toLowerCase())) {
         String msg = "'" + sc + "' (possible columns are " + tableCols.toString() + ")";
@@ -565,17 +511,6 @@ private void validateSpecifiedColumnNames(String tableName, List specifi
     }
   }
 
-  private List<String> getPartitionKeys(String tableName) throws SemanticException {
-    List<FieldSchema> fields;
-    try {
-      fields = db.getTable(tableName).getPartitionKeys();
-    } catch (HiveException e) {
-      throw new SemanticException(ErrorMsg.INVALID_TABLE.getMsg(tableName));
-    }
-
-    return Utilities.getColumnNamesFromFieldSchema(fields);
-  }
-
   private void checkForPartitionColumns(List<String> specifiedCols, List<String> partCols)
       throws SemanticException {
     // Raise error if user has specified partition column for stats
@@ -602,7 +537,7 @@ public void analyze(ASTNode ast, Context origCtx) throws SemanticException {
     qb = getQB();
     qb.setAnalyzeRewrite(true);
     qbp = qb.getParseInfo();
-    qbp.setTableName(tableName);
+    qbp.setTableName(tbl.getTableName());
     qbp.setTblLvl(isTableLevel);
 
     if (!isTableLevel) {
diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/HiveParser.g ql/src/java/org/apache/hadoop/hive/ql/parse/HiveParser.g
index 6d958fd..f934ac4 100644
--- ql/src/java/org/apache/hadoop/hive/ql/parse/HiveParser.g
+++ ql/src/java/org/apache/hadoop/hive/ql/parse/HiveParser.g
@@ -1312,7 +1312,9 @@ descStatement
 analyzeStatement
 @init { pushMsg("analyze statement", state); }
 @after { popMsg(state); }
-    : KW_ANALYZE KW_TABLE (parttype=tableOrPartition) KW_COMPUTE KW_STATISTICS ((noscan=KW_NOSCAN) | (partialscan=KW_PARTIALSCAN) | (KW_FOR KW_COLUMNS statsColumnName=columnNameList))? -> ^(TOK_ANALYZE $parttype $noscan? $partialscan? $statsColumnName?)
+    : KW_ANALYZE KW_TABLE (parttype=tableOrPartition) KW_COMPUTE KW_STATISTICS ((noscan=KW_NOSCAN) | (partialscan=KW_PARTIALSCAN)
+      | (KW_FOR KW_COLUMNS (statsColumnName=columnNameList)?))?
+      -> ^(TOK_ANALYZE $parttype $noscan? $partialscan? KW_COLUMNS? $statsColumnName?)
     ;
 
 showStatement
diff --git ql/src/test/queries/clientpositive/columnstats_partlvl.q ql/src/test/queries/clientpositive/columnstats_partlvl.q
index 9dfe8ff..14ce2a8 100644
--- ql/src/test/queries/clientpositive/columnstats_partlvl.q
+++ ql/src/test/queries/clientpositive/columnstats_partlvl.q
@@ -18,3 +18,9 @@ analyze table Employee_Part partition (employeeSalary=4000.0) compute statistics
 explain extended
 analyze table Employee_Part partition (employeeSalary=4000.0) compute statistics for columns employeeID;
 analyze table Employee_Part partition (employeeSalary=4000.0) compute statistics for columns employeeID;
+
+explain
+analyze table Employee_Part partition (employeeSalary=2000.0) compute statistics for columns;
+analyze table Employee_Part partition (employeeSalary=2000.0) compute statistics for columns;
+
+
diff --git ql/src/test/queries/clientpositive/columnstats_tbllvl.q ql/src/test/queries/clientpositive/columnstats_tbllvl.q
index 170fbc5..2039c07 100644
--- ql/src/test/queries/clientpositive/columnstats_tbllvl.q
+++ ql/src/test/queries/clientpositive/columnstats_tbllvl.q
@@ -23,6 +23,11 @@ analyze table UserVisits_web_text_none compute statistics for columns sourceIP,
 analyze table UserVisits_web_text_none compute statistics for columns sourceIP, avgTimeOnSite, adRevenue;
 
+explain
+analyze table UserVisits_web_text_none compute statistics for columns;
+
+analyze table UserVisits_web_text_none compute statistics for columns;
+
 CREATE TABLE empty_tab(
    a int,
    b double,
diff --git ql/src/test/results/clientpositive/columnstats_partlvl.q.out ql/src/test/results/clientpositive/columnstats_partlvl.q.out
index d91be8d..a4c4677 100644
--- ql/src/test/results/clientpositive/columnstats_partlvl.q.out
+++ ql/src/test/results/clientpositive/columnstats_partlvl.q.out
@@ -76,7 +76,7 @@ STAGE PLANS:
           Columns: employeeID
           Column Types: int
           Partition: employeesalary=2000.0
-          Table: Employee_Part
+          Table: employee_part
 
 PREHOOK: query: explain extended
 analyze table Employee_Part partition (employeeSalary=2000.0) compute statistics for columns employeeID
@@ -94,6 +94,7 @@ TOK_ANALYZE
          TOK_PARTVAL
             employeeSalary
             2000.0
+   columns
    TOK_TABCOLNAME
       employeeID
 
@@ -208,7 +209,7 @@ STAGE PLANS:
           Columns: employeeID
           Column Types: int
           Partition: employeesalary=2000.0
-          Table: Employee_Part
+          Table: employee_part
           Is Table Level Stats: false
 
 PREHOOK: query: analyze table Employee_Part partition (employeeSalary=2000.0) compute statistics for columns employeeID
@@ -268,7 +269,7 @@ STAGE PLANS:
           Columns: employeeID
           Column Types: int
           Partition: employeesalary=4000.0
-          Table: Employee_Part
+          Table: employee_part
 
 PREHOOK: query: explain extended
 analyze table Employee_Part partition (employeeSalary=4000.0) compute statistics for columns employeeID
@@ -286,6 +287,7 @@ TOK_ANALYZE
          TOK_PARTVAL
             employeeSalary
             4000.0
+   columns
    TOK_TABCOLNAME
       employeeID
 
@@ -400,7 +402,7 @@ STAGE PLANS:
           Columns: employeeID
           Column Types: int
           Partition: employeesalary=4000.0
-          Table: Employee_Part
+          Table: employee_part
           Is Table Level Stats: false
 
 PREHOOK: query: analyze table Employee_Part partition (employeeSalary=4000.0) compute statistics for columns employeeID
@@ -413,3 +415,62 @@ POSTHOOK: type: QUERY
 POSTHOOK: Input: default@employee_part
 POSTHOOK: Input: default@employee_part@employeesalary=4000.0
 #### A masked pattern was here ####
+PREHOOK: query: explain
+analyze table Employee_Part partition (employeeSalary=2000.0) compute statistics for columns
+PREHOOK: type: QUERY
+POSTHOOK: query: explain
+analyze table Employee_Part partition (employeeSalary=2000.0) compute statistics for columns
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-0 is a root stage
+  Stage-1 is a root stage
+
+STAGE PLANS:
+  Stage: Stage-0
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: employee_part
+            Select Operator
+              expressions: employeeid (type: int), employeename (type: string)
+              outputColumnNames: employeeid, employeename
+              Group By Operator
+                aggregations: compute_stats(employeeid, 16), compute_stats(employeename, 16)
+                mode: hash
+                outputColumnNames: _col0, _col1
+                Reduce Output Operator
+                  sort order:
+                  value expressions: _col0 (type: struct), _col1 (type: struct)
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations: compute_stats(VALUE._col0), compute_stats(VALUE._col1)
+          mode: mergepartial
+          outputColumnNames: _col0, _col1
+          Select Operator
+            expressions: _col0 (type: struct), _col1 (type: struct)
+            outputColumnNames: _col0, _col1
+            File Output Operator
+              compressed: false
+              table:
+                  input format: org.apache.hadoop.mapred.TextInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                  serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-1
+    Column Stats Work
+      Column Stats Desc:
+          Columns: employeeid, employeename
+          Column Types: int, string
+          Partition: employeesalary=2000.0
+          Table: employee_part
+
+PREHOOK: query: analyze table Employee_Part partition (employeeSalary=2000.0) compute statistics for columns
+PREHOOK: type: QUERY
+PREHOOK: Input: default@employee_part
+PREHOOK: Input: default@employee_part@employeesalary=2000.0
+#### A masked pattern was here ####
+POSTHOOK: query: analyze table Employee_Part partition (employeeSalary=2000.0) compute statistics for columns
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@employee_part
+POSTHOOK: Input: default@employee_part@employeesalary=2000.0
+#### A masked pattern was here ####
diff --git ql/src/test/results/clientpositive/columnstats_tbllvl.q.out ql/src/test/results/clientpositive/columnstats_tbllvl.q.out
index 3d3d0e2..e1885df 100644
--- ql/src/test/results/clientpositive/columnstats_tbllvl.q.out
+++ ql/src/test/results/clientpositive/columnstats_tbllvl.q.out
@@ -83,7 +83,7 @@ STAGE PLANS:
       Column Stats Desc:
          Columns: sourceIP, avgTimeOnSite, adRevenue
          Column Types: string, int, float
-         Table: UserVisits_web_text_none
+         Table: uservisits_web_text_none
 
 PREHOOK: query: explain extended
 analyze table UserVisits_web_text_none compute statistics for columns sourceIP, avgTimeOnSite, adRevenue
@@ -97,6 +97,7 @@ TOK_ANALYZE
    TOK_TAB
       TOK_TABNAME
          UserVisits_web_text_none
+   columns
    TOK_TABCOLNAME
       sourceIP
       avgTimeOnSite
@@ -211,7 +212,7 @@ STAGE PLANS:
       Column Stats Desc:
          Columns: sourceIP, avgTimeOnSite, adRevenue
          Column Types: string, int, float
-         Table: UserVisits_web_text_none
+         Table: uservisits_web_text_none
          Is Table Level Stats: true
 
 PREHOOK: query: analyze table UserVisits_web_text_none compute statistics for columns sourceIP, avgTimeOnSite, adRevenue
@@ -222,6 +223,62 @@ POSTHOOK: query: analyze table UserVisits_web_text_none compute statistics for c
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@uservisits_web_text_none
 #### A masked pattern was here ####
+PREHOOK: query: explain
+analyze table UserVisits_web_text_none compute statistics for columns
+PREHOOK: type: QUERY
+POSTHOOK: query: explain
+analyze table UserVisits_web_text_none compute statistics for columns
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-0 is a root stage
+  Stage-1 is a root stage
+
+STAGE PLANS:
+  Stage: Stage-0
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: uservisits_web_text_none
+            Select Operator
+              expressions: sourceip (type: string), desturl (type: string), visitdate (type: string), adrevenue (type: float), useragent (type: string), ccode (type: string), lcode (type: string), skeyword (type: string), avgtimeonsite (type: int)
+              outputColumnNames: sourceip, desturl, visitdate, adrevenue, useragent, ccode, lcode, skeyword, avgtimeonsite
+              Group By Operator
+                aggregations: compute_stats(sourceip, 16), compute_stats(desturl, 16), compute_stats(visitdate, 16), compute_stats(adrevenue, 16), compute_stats(useragent, 16), compute_stats(ccode, 16), compute_stats(lcode, 16), compute_stats(skeyword, 16), compute_stats(avgtimeonsite, 16)
+                mode: hash
+                outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8
+                Reduce Output Operator
+                  sort order:
+                  value expressions: _col0 (type: struct), _col1 (type: struct), _col2 (type: struct), _col3 (type: struct), _col4 (type: struct), _col5 (type: struct), _col6 (type: struct), _col7 (type: struct), _col8 (type: struct)
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations: compute_stats(VALUE._col0), compute_stats(VALUE._col1), compute_stats(VALUE._col2), compute_stats(VALUE._col3), compute_stats(VALUE._col4), compute_stats(VALUE._col5), compute_stats(VALUE._col6), compute_stats(VALUE._col7), compute_stats(VALUE._col8)
+          mode: mergepartial
+          outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8
+          Select Operator
+            expressions: _col0 (type: struct), _col1 (type: struct), _col2 (type: struct), _col3 (type: struct), _col4 (type: struct), _col5 (type: struct), _col6 (type: struct), _col7 (type: struct), _col8 (type: struct)
+            outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8
+            File Output Operator
+              compressed: false
+              table:
+                  input format: org.apache.hadoop.mapred.TextInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                  serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-1
+    Column Stats Work
+      Column Stats Desc:
+          Columns: sourceip, desturl, visitdate, adrevenue, useragent, ccode, lcode, skeyword, avgtimeonsite
+          Column Types: string, string, string, float, string, string, string, string, int
+          Table: uservisits_web_text_none
+
+PREHOOK: query: analyze table UserVisits_web_text_none compute statistics for columns
+PREHOOK: type: QUERY
+PREHOOK: Input: default@uservisits_web_text_none
+#### A masked pattern was here ####
+POSTHOOK: query: analyze table UserVisits_web_text_none compute statistics for columns
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@uservisits_web_text_none
+#### A masked pattern was here ####
 PREHOOK: query: CREATE TABLE empty_tab(
    a int,
    b double,