From e3819a0fda1a2edf4a1640199d640d9240876be8 Mon Sep 17 00:00:00 2001
From: Ashutosh Chauhan
Date: Wed, 16 Dec 2015 21:00:21 -0800
Subject: [PATCH] HIVE-12590 : Repeated UDAFs with literals can produce incorrect result
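
doPhase1GetAllAggregations() collects aggregation expressions into a map
keyed by the stringified expression AST, and that key used to be
toStringTree().toLowerCase(). Two UDAF invocations that differ only in the
case of a string literal therefore collapsed into a single map entry and
returned the same value, e.g. the query added to groupby_duplicate_key.q:

  select max('pants'), max('pANTS') from src group by key limit 1;

Instead of lower-casing the whole tree string at the call site,
ASTNode.toStringTree() now lower-cases every token except
HiveParser.StringLiteral while memoizing (the memo buffer becomes a
StringBuilder), and identifiers and column aliases are lower-cased at the
points where they are parsed rather than inside RowResolver.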
---
 .../org/apache/hadoop/hive/ql/parse/ASTNode.java   | 25 ++++----
 .../apache/hadoop/hive/ql/parse/RowResolver.java   |  4 +-
 .../hadoop/hive/ql/parse/SemanticAnalyzer.java     | 14 ++---
 .../hadoop/hive/ql/parse/TypeCheckProcFactory.java |  4 +-
 .../queries/clientpositive/groupby_duplicate_key.q |  4 ++
 .../results/clientpositive/case_sensitivity.q.out  |  2 +-
 .../clientpositive/groupby_duplicate_key.q.out     | 71 ++++++++++++++++++++++
 .../tez/schema_evol_orc_acid_mapwork_table.q.out   |  6 +-
 .../schema_evol_orc_acidvec_mapwork_table.q.out    |  6 +-
 9 files changed, 106 insertions(+), 30 deletions(-)

diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/ASTNode.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/ASTNode.java
index b96e2eb..f92767d 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/ASTNode.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/ASTNode.java
@@ -32,7 +32,7 @@
  */
 public class ASTNode extends CommonTree implements Node,Serializable {
   private static final long serialVersionUID = 1L;
-  private transient StringBuffer astStr;
+  private transient StringBuilder astStr;
   private transient ASTNodeOrigin origin;
   private transient int startIndx = -1;
   private transient int endIndx = -1;
@@ -133,10 +133,11 @@ private StringBuilder dump(StringBuilder sb, String ws) {
     return sb;
   }
 
-  private ASTNode getRootNodeWithValidASTStr (boolean useMemoizedRoot) {
-    if (useMemoizedRoot && rootNode != null && rootNode.parent == null &&
+  private void getRootNodeWithValidASTStr () {
+
+    if (rootNode != null && rootNode.parent == null &&
         rootNode.hasValidMemoizedString()) {
-      return rootNode;
+      return;
     }
     ASTNode retNode = this;
     while (retNode.parent != null) {
@@ -144,11 +145,11 @@ private ASTNode getRootNodeWithValidASTStr (boolean useMemoizedRoot) {
     }
     rootNode=retNode;
     if (!rootNode.isValidASTStr) {
-      rootNode.astStr = new StringBuffer();
+      rootNode.astStr = new StringBuilder();
       rootNode.toStringTree(rootNode);
       rootNode.isValidASTStr = true;
     }
-    return retNode;
+    return;
   }
 
   private boolean hasValidMemoizedString() {
@@ -174,7 +175,7 @@ private String getMemoizedSubString(int start, int end) {
 
   private void addtoMemoizedString(String string) {
     if (astStr == null) {
-      astStr = new StringBuffer();
+      astStr = new StringBuilder();
     }
     astStr.append(string);
   }
@@ -227,7 +228,7 @@ public String toStringTree() {
 
     // The root might have changed because of tree modifications.
     // Compute the new root for this tree and set the astStr.
-    getRootNodeWithValidASTStr(true);
+    getRootNodeWithValidASTStr();
 
     // If rootNotModified is false, then startIndx and endIndx will be stale.
     if (startIndx >= 0 && endIndx <= rootNode.getMemoizedStringLen()) {
@@ -241,13 +242,15 @@ private String toStringTree(ASTNode rootNode) {
     startIndx = rootNode.getMemoizedStringLen();
     // Leaf node
     if ( children==null || children.size()==0 ) {
-      rootNode.addtoMemoizedString(this.toString());
+      rootNode.addtoMemoizedString(this.getType() != HiveParser.StringLiteral ? this.toString().toLowerCase() : this.toString());
       endIndx = rootNode.getMemoizedStringLen();
-      return this.toString();
+      return this.getType() != HiveParser.StringLiteral ? this.toString().toLowerCase() : this.toString();
     }
+    String str;
     if ( !isNil() ) {
       rootNode.addtoMemoizedString("(");
-      rootNode.addtoMemoizedString(this.toString());
+      str = this.toString();
+      rootNode.addtoMemoizedString((this.getType() == HiveParser.StringLiteral || null == str) ? str : str.toLowerCase());
       rootNode.addtoMemoizedString(" ");
     }
     for (int i = 0; children!=null && i < children.size(); i++) {
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/RowResolver.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/RowResolver.java
index 891b1f7..0bd036f 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/RowResolver.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/RowResolver.java
@@ -112,7 +112,6 @@ public boolean addMappingOnly(String tab_alias, String col_alias, ColumnInfo col
     if (tab_alias != null) {
       tab_alias = tab_alias.toLowerCase();
     }
-    col_alias = col_alias.toLowerCase();
 
     /*
      * allow multiple mappings to the same ColumnInfo.
@@ -169,7 +168,6 @@ public boolean hasTableAlias(String tab_alias) {
    * @throws SemanticException
    */
   public ColumnInfo get(String tab_alias, String col_alias) throws SemanticException {
-    col_alias = col_alias.toLowerCase();
     ColumnInfo ret = null;
 
     if (tab_alias != null) {
@@ -476,4 +474,4 @@ public RowResolver duplicate() {
     resolver.isExprResolver = isExprResolver;
     return resolver;
   }
-}
+}
\ No newline at end of file
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
index ab9271f..4c4f291 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
@@ -542,7 +542,7 @@ private void doPhase1GetAllAggregations(ASTNode expressionTree,
         if(containsLeadLagUDF(expressionTree)) {
           throw new SemanticException(ErrorMsg.MISSING_OVER_CLAUSE.getMsg(functionName));
         }
-        aggregations.put(expressionTree.toStringTree().toLowerCase(), expressionTree);
+        aggregations.put(expressionTree.toStringTree(), expressionTree);
         FunctionInfo fi = FunctionRegistry.getFunctionInfo(functionName);
         if (!fi.isNative()) {
           unparseTranslator.addIdentifierTranslation((ASTNode) expressionTree
@@ -3530,7 +3530,7 @@ public static int setBit(int bitmap, int bitIdx) {
         (selExpr.getChildCount() == 3 &&
         selExpr.getChild(2).getType() == HiveParser.TOK_WINDOWSPEC)) {
       // return zz for "xx + yy AS zz"
-      colAlias = unescapeIdentifier(selExpr.getChild(1).getText());
+      colAlias = unescapeIdentifier(selExpr.getChild(1).getText().toLowerCase());
       colRef[0] = tabAlias;
       colRef[1] = colAlias;
       return colRef;
@@ -3539,7 +3539,7 @@ public static int setBit(int bitmap, int bitIdx) {
     ASTNode root = (ASTNode) selExpr.getChild(0);
     if (root.getType() == HiveParser.TOK_TABLE_OR_COL) {
       colAlias =
-          BaseSemanticAnalyzer.unescapeIdentifier(root.getChild(0).getText());
+          BaseSemanticAnalyzer.unescapeIdentifier(root.getChild(0).getText().toLowerCase());
       colRef[0] = tabAlias;
       colRef[1] = colAlias;
       return colRef;
@@ -3557,7 +3557,7 @@ public static int setBit(int bitmap, int bitIdx) {
       // Return zz for "xx.zz" and "xx.yy.zz"
       ASTNode col = (ASTNode) root.getChild(1);
       if (col.getType() == HiveParser.Identifier) {
-        colAlias = unescapeIdentifier(col.getText());
+        colAlias = unescapeIdentifier(col.getText().toLowerCase());
       }
     }
 
@@ -3567,7 +3567,7 @@ public static int setBit(int bitmap, int bitIdx) {
     String expr_flattened = root.toStringTree();
 
     // remove all TOK tokens
-    String expr_no_tok = expr_flattened.replaceAll("TOK_\\S+", "");
+    String expr_no_tok = expr_flattened.replaceAll("tok_\\S+", "");
 
     // remove all non alphanumeric letters, replace whitespace spans with underscore
     String expr_formatted = expr_no_tok.replaceAll("\\W", " ").trim().replaceAll("\\s+", "_");
@@ -3705,7 +3705,7 @@ static boolean isRegex(String pattern, HiveConf conf) {
       ASTNode selExprChild = (ASTNode) selExpr.getChild(i);
       switch (selExprChild.getType()) {
       case HiveParser.Identifier:
-        udtfColAliases.add(unescapeIdentifier(selExprChild.getText()));
+        udtfColAliases.add(unescapeIdentifier(selExprChild.getText().toLowerCase()));
         unparseTranslator.addIdentifierTranslation(selExprChild);
         break;
       case HiveParser.TOK_TABALIAS:
@@ -5381,7 +5381,7 @@ private Operator genGroupByPlan1ReduceMultiGBY(List<String> dests, QB qb, Operat
       if (!groupingSets.isEmpty()) {
         throw new SemanticException(ErrorMsg.HIVE_GROUPING_SETS_AGGR_NOMAPAGGR_MULTIGBY.getMsg());
       }
-
+
       ASTNode whereExpr = parseInfo.getWhrForClause(dest);
 
       if (whereExpr != null) {
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/TypeCheckProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/TypeCheckProcFactory.java
index 9d8b352..598520c 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/TypeCheckProcFactory.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/TypeCheckProcFactory.java
@@ -378,7 +378,7 @@ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
       default:
         // HiveParser.identifier | HiveParse.KW_IF | HiveParse.KW_LEFT |
         // HiveParse.KW_RIGHT
-        str = BaseSemanticAnalyzer.unescapeIdentifier(expr.getText());
+        str = BaseSemanticAnalyzer.unescapeIdentifier(expr.getText().toLowerCase());
        break;
      }
      return new ExprNodeConstantDesc(TypeInfoFactory.stringTypeInfo, str);
@@ -818,7 +818,7 @@ static ExprNodeDesc getFuncExprNodeDescWithUdfData(String udfName, TypeInfo type
         ((SettableUDF)genericUDF).setTypeInfo(typeInfo);
       }
     }
-
+
     List<ExprNodeDesc> childrenList = new ArrayList<ExprNodeDesc>(children.length);
     childrenList.addAll(Arrays.asList(children));
 
diff --git a/ql/src/test/queries/clientpositive/groupby_duplicate_key.q b/ql/src/test/queries/clientpositive/groupby_duplicate_key.q
index 7f38efe..1909873 100644
--- a/ql/src/test/queries/clientpositive/groupby_duplicate_key.q
+++ b/ql/src/test/queries/clientpositive/groupby_duplicate_key.q
@@ -11,3 +11,7 @@ create table dummy as
 select distinct key, "X" as dummy1, "X" as dummy2 from src tablesample (10 rows);
 
 select key,dummy1,dummy2 from dummy;
+
+explain
+select max('pants'), max('pANTS') from src group by key limit 1;
+select max('pants'), max('pANTS') from src group by key limit 1;
diff --git a/ql/src/test/results/clientpositive/case_sensitivity.q.out b/ql/src/test/results/clientpositive/case_sensitivity.q.out
index a5b14e8..b3969cc 100644
--- a/ql/src/test/results/clientpositive/case_sensitivity.q.out
+++ b/ql/src/test/results/clientpositive/case_sensitivity.q.out
@@ -35,7 +35,7 @@ STAGE PLANS:
             predicate: (lint[0] > 0) (type: boolean)
             Statistics: Num rows: 3 Data size: 837 Basic stats: COMPLETE Column stats: NONE
             Select Operator
-              expressions: lint[1] (type: int), lintstring[0].MYSTRING (type: string)
+              expressions: lint[1] (type: int), lintstring[0].mystring (type: string)
               outputColumnNames: _col0, _col1
               Statistics: Num rows: 3 Data size: 837 Basic stats: COMPLETE Column stats: NONE
               File Output Operator
diff --git a/ql/src/test/results/clientpositive/groupby_duplicate_key.q.out b/ql/src/test/results/clientpositive/groupby_duplicate_key.q.out
index fc95f41..8ca8866 100644
--- a/ql/src/test/results/clientpositive/groupby_duplicate_key.q.out
+++ b/ql/src/test/results/clientpositive/groupby_duplicate_key.q.out
@@ -175,3 +175,74 @@ POSTHOOK: Input: default@dummy
 484	X	X
 86	X	X
 98	X	X
+PREHOOK: query: explain
+select max('pants'), max('pANTS') from src group by key limit 1
+PREHOOK: type: QUERY
+POSTHOOK: query: explain
+select max('pants'), max('pANTS') from src group by key limit 1
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: src
+            Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+            Select Operator
+              expressions: key (type: string)
+              outputColumnNames: key
+              Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+              Group By Operator
+                aggregations: max('pants'), max('pANTS')
+                keys: key (type: string)
+                mode: hash
+                outputColumnNames: _col0, _col1, _col2
+                Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+                Reduce Output Operator
+                  key expressions: _col0 (type: string)
+                  sort order: +
+                  Map-reduce partition columns: _col0 (type: string)
+                  Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+                  TopN Hash Memory Usage: 0.1
+                  value expressions: _col1 (type: string), _col2 (type: string)
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations: max(VALUE._col0), max(VALUE._col1)
+          keys: KEY._col0 (type: string)
+          mode: mergepartial
+          outputColumnNames: _col0, _col1, _col2
+          Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
+          Select Operator
+            expressions: _col1 (type: string), _col2 (type: string)
+            outputColumnNames: _col0, _col1
+            Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
+            Limit
+              Number of rows: 1
+              Statistics: Num rows: 1 Data size: 10 Basic stats: COMPLETE Column stats: NONE
+              File Output Operator
+                compressed: false
+                Statistics: Num rows: 1 Data size: 10 Basic stats: COMPLETE Column stats: NONE
+                table:
+                    input format: org.apache.hadoop.mapred.TextInputFormat
+                    output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                    serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: 1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select max('pants'), max('pANTS') from src group by key limit 1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+#### A masked pattern was here ####
+POSTHOOK: query: select max('pants'), max('pANTS') from src group by key limit 1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+#### A masked pattern was here ####
+pants	pANTS
diff --git a/ql/src/test/results/clientpositive/tez/schema_evol_orc_acid_mapwork_table.q.out b/ql/src/test/results/clientpositive/tez/schema_evol_orc_acid_mapwork_table.q.out
index 0317a99..644ca57 100644
--- a/ql/src/test/results/clientpositive/tez/schema_evol_orc_acid_mapwork_table.q.out
+++ b/ql/src/test/results/clientpositive/tez/schema_evol_orc_acid_mapwork_table.q.out
@@ -372,7 +372,7 @@ update table5 set c=99
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@table5
 POSTHOOK: Output: default@table5
-row__id	a	b	_c3	d
+ROW__ID	a	b	_c3	d
 PREHOOK: query: select a,b,c,d from table5
 PREHOOK: type: QUERY
 PREHOOK: Input: default@table5
@@ -484,7 +484,7 @@ delete from table6 where a = 2 or a = 4 or a = 6
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@table6
 POSTHOOK: Output: default@table6
-row__id
+ROW__ID
 PREHOOK: query: select a,b,c,d from table6
 PREHOOK: type: QUERY
 PREHOOK: Input: default@table6
@@ -591,7 +591,7 @@ delete from table7 where a = 1 or c = 30 or c == 100
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@table7
 POSTHOOK: Output: default@table7
-row__id
+ROW__ID
 PREHOOK: query: select a,b,c,d from table7
 PREHOOK: type: QUERY
 PREHOOK: Input: default@table7
diff --git a/ql/src/test/results/clientpositive/tez/schema_evol_orc_acidvec_mapwork_table.q.out b/ql/src/test/results/clientpositive/tez/schema_evol_orc_acidvec_mapwork_table.q.out
index 3edaff0..63830e2 100644
--- a/ql/src/test/results/clientpositive/tez/schema_evol_orc_acidvec_mapwork_table.q.out
+++ b/ql/src/test/results/clientpositive/tez/schema_evol_orc_acidvec_mapwork_table.q.out
@@ -372,7 +372,7 @@ update table5 set c=99
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@table5
 POSTHOOK: Output: default@table5
-row__id	a	b	_c3	d
+ROW__ID	a	b	_c3	d
 PREHOOK: query: select a,b,c,d from table5
 PREHOOK: type: QUERY
 PREHOOK: Input: default@table5
@@ -484,7 +484,7 @@ delete from table6 where a = 2 or a = 4 or a = 6
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@table6
 POSTHOOK: Output: default@table6
-row__id
+ROW__ID
 PREHOOK: query: select a,b,c,d from table6
 PREHOOK: type: QUERY
 PREHOOK: Input: default@table6
@@ -591,7 +591,7 @@ delete from table7 where a = 1 or c = 30 or c == 100
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@table7
 POSTHOOK: Output: default@table7
-row__id
+ROW__ID
 PREHOOK: query: select a,b,c,d from table7
 PREHOOK: type: QUERY
 PREHOOK: Input: default@table7
-- 
1.7.12.4 (Apple Git-37)