From fe94600fcffa481847f4480aa1d3530d046c8144 Mon Sep 17 00:00:00 2001 From: Ashutosh Chauhan Date: Wed, 16 Dec 2015 21:00:21 -0800 Subject: [PATCH] HIVE-12590 : Repeated UDAFs with literals can produce incorrect result --- .../org/apache/hadoop/hive/ql/parse/ASTNode.java | 27 ++ .../apache/hadoop/hive/ql/parse/RowResolver.java | 30 +- .../hadoop/hive/ql/parse/SemanticAnalyzer.java | 2 +- .../queries/clientpositive/groupby_duplicate_key.q | 4 + .../clientpositive/groupby_duplicate_key.q.out | 71 +++++ 5 files changed, 309 insertions(+), 151 deletions(-) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/ASTNode.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/ASTNode.java index b96e2eb..573c6db 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/ASTNode.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/ASTNode.java @@ -263,4 +263,31 @@ private String toStringTree(ASTNode rootNode) { endIndx = rootNode.getMemoizedStringLen(); return rootNode.getMemoizedSubString(startIndx, endIndx); } + + public String toLowerStringTree() { + if ( children==null || children.size()==0 ) { + return this.getType() == HiveParser.StringLiteral ? this.toString() : this.toString().toLowerCase(); + } + StringBuffer buf = new StringBuffer(); + if ( !isNil() ) { + buf.append("("); + if (this.getType() == HiveParser.StringLiteral) { + buf.append(this.toString()); + } else { + buf.append(this.toString().toLowerCase()); + } + buf.append(' '); + } + for (int i = 0; children!=null && i < children.size(); i++) { + ASTNode t = (ASTNode)children.get(i); + if ( i>0 ) { + buf.append(' '); + } + buf.append(t.toLowerStringTree()); + } + if ( !isNil() ) { + buf.append(")"); + } + return buf.toString(); + } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/RowResolver.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/RowResolver.java index 891b1f7..61907d5 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/RowResolver.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/RowResolver.java @@ -75,7 +75,7 @@ public RowResolver() { public void putExpression(ASTNode node, ColumnInfo colInfo) { String treeAsString = node.toStringTree(); expressionMap.put(treeAsString, node); - put("", treeAsString, colInfo); + putColInfo("", node.toLowerStringTree(), colInfo, false); } /** @@ -83,7 +83,7 @@ public void putExpression(ASTNode node, ColumnInfo colInfo) { * exactly matches the string rendering of the given ASTNode. */ public ColumnInfo getExpression(ASTNode node) throws SemanticException { - return get("", node.toStringTree()); + return getColInfo("", node.toLowerStringTree()); } /** @@ -94,26 +94,37 @@ public ASTNode getExpressionSource(ASTNode node) { return expressionMap.get(node.toStringTree()); } - public void put(String tab_alias, String col_alias, ColumnInfo colInfo) { - if (!addMappingOnly(tab_alias, col_alias, colInfo)) { + private void putColInfo(String tab_alias, String col_alias, ColumnInfo colInfo, boolean caseInsensitive) { + if (!addMappingOnlyInternal(tab_alias, col_alias, colInfo, caseInsensitive)) { //Make sure that the table alias and column alias are stored //in the column info if (tab_alias != null) { colInfo.setTabAlias(tab_alias.toLowerCase()); } + if (col_alias != null) { - colInfo.setAlias(col_alias.toLowerCase()); + colInfo.setAlias(caseInsensitive ? col_alias.toLowerCase() : col_alias); } rowSchema.getSignature().add(colInfo); } } + public void put(String tab_alias, String col_alias, ColumnInfo colInfo) { + putColInfo(tab_alias, col_alias, colInfo, true); + } + public boolean addMappingOnly(String tab_alias, String col_alias, ColumnInfo colInfo) { + return addMappingOnlyInternal(tab_alias, col_alias, colInfo, true); + } + + private boolean addMappingOnlyInternal(String tab_alias, String col_alias, ColumnInfo colInfo, boolean caseInsensitive) { if (tab_alias != null) { tab_alias = tab_alias.toLowerCase(); } - col_alias = col_alias.toLowerCase(); + if (col_alias != null && caseInsensitive) { + col_alias = col_alias.toLowerCase(); + } /* * allow multiple mappings to the same ColumnInfo. * When a ColumnInfo is mapped multiple times, only the @@ -169,7 +180,11 @@ public boolean hasTableAlias(String tab_alias) { * @throws SemanticException */ public ColumnInfo get(String tab_alias, String col_alias) throws SemanticException { - col_alias = col_alias.toLowerCase(); + return getColInfo(tab_alias, col_alias.toLowerCase()); + + } + + private ColumnInfo getColInfo(String tab_alias, String col_alias) throws SemanticException{ ColumnInfo ret = null; if (tab_alias != null) { @@ -203,6 +218,7 @@ public ColumnInfo get(String tab_alias, String col_alias) throws SemanticExcepti } return ret; + } public ArrayList getColumnInfos() { diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java index 27549dc..90a1f69 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java @@ -542,7 +542,7 @@ private void doPhase1GetAllAggregations(ASTNode expressionTree, if(containsLeadLagUDF(expressionTree)) { throw new SemanticException(ErrorMsg.MISSING_OVER_CLAUSE.getMsg(functionName)); } - aggregations.put(expressionTree.toStringTree().toLowerCase(), expressionTree); + aggregations.put(expressionTree.toLowerStringTree(), expressionTree); FunctionInfo fi = FunctionRegistry.getFunctionInfo(functionName); if (!fi.isNative()) { unparseTranslator.addIdentifierTranslation((ASTNode) expressionTree diff --git a/ql/src/test/queries/clientpositive/groupby_duplicate_key.q b/ql/src/test/queries/clientpositive/groupby_duplicate_key.q index 7f38efe..1909873 100644 --- a/ql/src/test/queries/clientpositive/groupby_duplicate_key.q +++ b/ql/src/test/queries/clientpositive/groupby_duplicate_key.q @@ -11,3 +11,7 @@ create table dummy as select distinct key, "X" as dummy1, "X" as dummy2 from src tablesample (10 rows); select key,dummy1,dummy2 from dummy; + +explain +select max('pants'), max('pANTS') from src group by key limit 1; +select max('pants'), max('pANTS') from src group by key limit 1; diff --git a/ql/src/test/results/clientpositive/groupby_duplicate_key.q.out b/ql/src/test/results/clientpositive/groupby_duplicate_key.q.out index fc95f41..8ca8866 100644 --- a/ql/src/test/results/clientpositive/groupby_duplicate_key.q.out +++ b/ql/src/test/results/clientpositive/groupby_duplicate_key.q.out @@ -175,3 +175,74 @@ POSTHOOK: Input: default@dummy 484 X X 86 X X 98 X X +PREHOOK: query: explain +select max('pants'), max('pANTS') from src group by key limit 1 +PREHOOK: type: QUERY +POSTHOOK: query: explain +select max('pants'), max('pANTS') from src group by key limit 1 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: max('pants'), max('pANTS') + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + TopN Hash Memory Usage: 0.1 + value expressions: _col1 (type: string), _col2 (type: string) + Reduce Operator Tree: + Group By Operator + aggregations: max(VALUE._col0), max(VALUE._col1) + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col1 (type: string), _col2 (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + Limit + Number of rows: 1 + Statistics: Num rows: 1 Data size: 10 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 10 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: 1 + Processor Tree: + ListSink + +PREHOOK: query: select max('pants'), max('pANTS') from src group by key limit 1 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select max('pants'), max('pANTS') from src group by key limit 1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +pants pANTS -- 1.7.12.4 (Apple Git-37)