diff --git ql/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java ql/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java
index 8ae1c73..97c7eae 100644
--- ql/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java
+++ ql/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java
@@ -19,6 +19,7 @@
 package org.apache.hadoop.hive.ql;
 
 import org.antlr.runtime.tree.Tree;
+import org.apache.hadoop.hive.conf.HiveConf;
 import org.apache.hadoop.hive.ql.metadata.HiveUtils;
 import org.apache.hadoop.hive.ql.parse.ASTNode;
 import org.apache.hadoop.hive.ql.parse.ASTNodeOrigin;
@@ -29,12 +30,6 @@
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
-import org.antlr.runtime.tree.Tree;
-import org.apache.hadoop.hive.conf.HiveConf;
-import org.apache.hadoop.hive.ql.metadata.HiveUtils;
-import org.apache.hadoop.hive.ql.parse.ASTNode;
-import org.apache.hadoop.hive.ql.parse.ASTNodeOrigin;
-
 /**
  * List of all error messages.
  * This list contains both compile time and run-time errors.
@@ -80,7 +75,6 @@
   INVALID_JOIN_CONDITION_2(10018, "Neither left nor right aliases encountered in JOIN"),
   INVALID_JOIN_CONDITION_3(10019, "OR not supported in JOIN currently"),
   INVALID_TRANSFORM(10020, "TRANSFORM with other SELECT columns not supported"),
-  DUPLICATE_GROUPBY_KEY(10021, "Repeated key in GROUP BY"),
   UNSUPPORTED_MULTIPLE_DISTINCTS(10022, "DISTINCT on different columns not supported"
       + " with skew in data"),
   NO_SUBQUERY_ALIAS(10023, "No alias for subquery"),
diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
index cb284d7..f964933 100644
--- ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
+++ ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
@@ -62,6 +62,7 @@
 import org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator;
 import org.apache.hadoop.hive.ql.exec.ArchiveUtils;
 import org.apache.hadoop.hive.ql.exec.ColumnInfo;
+import org.apache.hadoop.hive.ql.exec.ExprNodeEvaluatorFactory;
 import org.apache.hadoop.hive.ql.exec.FetchTask;
 import org.apache.hadoop.hive.ql.exec.FileSinkOperator;
 import org.apache.hadoop.hive.ql.exec.FunctionInfo;
@@ -4131,6 +4132,8 @@ private ReduceSinkOperator genGroupByPlanReduceSinkOperator(QB qb,
         reduceSinkInputRowResolver, reduceSinkOutputRowResolver, outputKeyColumnNames,
         colExprMap);
 
+    int keyLength = reduceKeys.size();
+
     // add a key for reduce sink
     if (groupingSetsPresent) {
       // Process grouping set for the reduce sink operator
@@ -4182,7 +4185,7 @@ private ReduceSinkOperator genGroupByPlanReduceSinkOperator(QB qb,
     ReduceSinkOperator rsOp = (ReduceSinkOperator) putOpInsertMap(
         OperatorFactory.getAndMakeChild(
             PlanUtils.getReduceSinkDesc(reduceKeys,
-                groupingSetsPresent ? grpByExprs.size() + 1 : grpByExprs.size(),
+                groupingSetsPresent ? keyLength + 1 : keyLength,
                 reduceValues, distinctColIndices,
                 outputKeyColumnNames, outputValueColumnNames, true, -1, numPartitionFields,
                 numReducers),
@@ -4203,24 +4206,32 @@ private ReduceSinkOperator genGroupByPlanReduceSinkOperator(QB qb,
       ASTNode grpbyExpr = grpByExprs.get(i);
       ExprNodeDesc inputExpr = genExprNodeDesc(grpbyExpr,
           reduceSinkInputRowResolver);
-      reduceKeys.add(inputExpr);
-      if (reduceSinkOutputRowResolver.getExpression(grpbyExpr) == null) {
-        outputKeyColumnNames.add(getColumnInternalName(reduceKeys.size() - 1));
-        String field = Utilities.ReduceField.KEY.toString() + "."
-            + getColumnInternalName(reduceKeys.size() - 1);
-        ColumnInfo colInfo = new ColumnInfo(field, reduceKeys.get(
-            reduceKeys.size() - 1).getTypeInfo(), null, false);
-        reduceSinkOutputRowResolver.putExpression(grpbyExpr, colInfo);
-        colExprMap.put(colInfo.getInternalName(), inputExpr);
-      } else {
-        throw new SemanticException(ErrorMsg.DUPLICATE_GROUPBY_KEY
-            .getMsg(grpbyExpr));
+      ColumnInfo prev = reduceSinkOutputRowResolver.getExpression(grpbyExpr);
+      if (prev != null && isDeterministic(inputExpr)) {
+        colExprMap.put(prev.getInternalName(), inputExpr);
+        continue;
       }
+      reduceKeys.add(inputExpr);
+      outputKeyColumnNames.add(getColumnInternalName(reduceKeys.size() - 1));
+      String field = Utilities.ReduceField.KEY.toString() + "."
+          + getColumnInternalName(reduceKeys.size() - 1);
+      ColumnInfo colInfo = new ColumnInfo(field, reduceKeys.get(
+          reduceKeys.size() - 1).getTypeInfo(), null, false);
+      reduceSinkOutputRowResolver.putExpression(grpbyExpr, colInfo);
+      colExprMap.put(colInfo.getInternalName(), inputExpr);
     }
     return reduceKeys;
   }
 
+  private boolean isDeterministic(ExprNodeDesc expr) throws SemanticException {
+    try {
+      return ExprNodeEvaluatorFactory.get(expr).isDeterministic();
+    } catch (Exception e) {
+      throw new SemanticException(e);
+    }
+  }
+
   private List<List<Integer>> getDistinctColIndicesForReduceSink(QBParseInfo parseInfo,
       String dest,
       List<ExprNodeDesc> reduceKeys, RowResolver reduceSinkInputRowResolver,
@@ -4326,6 +4337,8 @@ private ReduceSinkOperator genCommonGroupByPlanReduceSinkOperator(QB qb,
+    int keyLength = reduceKeys.size();
+
     List<List<Integer>> distinctColIndices = getDistinctColIndicesForReduceSink(parseInfo, dest,
         reduceKeys, reduceSinkInputRowResolver, reduceSinkOutputRowResolver,
         outputKeyColumnNames, colExprMap);
@@ -4373,8 +4386,8 @@ private ReduceSinkOperator genCommonGroupByPlanReduceSinkOperator(QB qb,
       List outputKeyColumnNames, List outputValueColumnNames, boolean includeKey, int tag,
       int numPartitionFields, int numReducers) throws SemanticException {
-    ArrayList<ExprNodeDesc> partitionCols = null;
+    ArrayList<ExprNodeDesc> partitionCols = new ArrayList<ExprNodeDesc>();
     if (numPartitionFields >= keyCols.size()) {
-      partitionCols = keyCols;
+      partitionCols.addAll(keyCols);
     } else if (numPartitionFields >= 0) {
-      partitionCols = new ArrayList<ExprNodeDesc>(numPartitionFields);
-      for (int i = 0; i < numPartitionFields; i++) {
-        partitionCols.add(keyCols.get(i));
-      }
+      partitionCols.addAll(keyCols.subList(0, numPartitionFields));
     } else {
       // numPartitionFields = -1 means random partitioning
-      partitionCols = new ArrayList<ExprNodeDesc>(1);
-      partitionCols.add(TypeCheckProcFactory.DefaultExprProcessor
-          .getFuncExprNodeDesc("rand"));
+      partitionCols.add(TypeCheckProcFactory.DefaultExprProcessor.getFuncExprNodeDesc("rand"));
     }
 
     StringBuilder order = new StringBuilder();
diff --git ql/src/test/queries/clientpositive/groupby_duplicate_key.q ql/src/test/queries/clientpositive/groupby_duplicate_key.q
new file mode 100644
index 0000000..7f38efe
--- /dev/null
+++ ql/src/test/queries/clientpositive/groupby_duplicate_key.q
@@ -0,0 +1,13 @@
+explain
+select distinct key, "" as dummy1, "" as dummy2 from src tablesample (10 rows);
+
+select distinct key, "" as dummy1, "" as dummy2 from src tablesample (10 rows);
+
+explain
+create table dummy as
+select distinct key, "X" as dummy1, "X" as dummy2 from src tablesample (10 rows);
+
+create table dummy as
+select distinct key, "X" as dummy1, "X" as dummy2 from src tablesample (10 rows);
+
+select key,dummy1,dummy2 from dummy;
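The SemanticAnalyzer hunks above are the core of the change: a repeated GROUP BY expression no longer raises DUPLICATE_GROUPBY_KEY. Instead, a duplicate deterministic expression reuses the reduce-sink key already emitted for it (its column is simply re-pointed in colExprMap), non-deterministic duplicates such as rand() keep their own keys, and the key count handed to PlanUtils.getReduceSinkDesc becomes the deduplicated keyLength rather than grpByExprs.size(). The following stand-alone sketch only illustrates that dedup idea; the names (DedupGroupByKeysSketch, Expr, dedupKeys) are made up for illustration and are not Hive's ExprNodeDesc/ColumnInfo/RowResolver APIs.

    import java.util.ArrayList;
    import java.util.LinkedHashMap;
    import java.util.List;
    import java.util.Map;

    // Illustrative sketch only (assumed names, not Hive code): collapse duplicate
    // deterministic GROUP BY expressions into a single reduce-sink key, keeping a
    // column -> expression map so every SELECT column still resolves to a key.
    public class DedupGroupByKeysSketch {

      // Stand-in for Hive's ExprNodeDesc: an expression string plus a deterministic
      // flag (rand(), for example, would be non-deterministic).
      static final class Expr {
        final String text;
        final boolean deterministic;
        Expr(String text, boolean deterministic) {
          this.text = text;
          this.deterministic = deterministic;
        }
      }

      static List<Expr> dedupKeys(List<Expr> grpByExprs, Map<String, Expr> colExprMap) {
        List<Expr> reduceKeys = new ArrayList<>();
        Map<String, String> seen = new LinkedHashMap<>();  // expr text -> key column
        for (Expr e : grpByExprs) {
          String prevCol = seen.get(e.text);
          if (prevCol != null && e.deterministic) {
            // Same deterministic expression already emitted: reuse its key column.
            colExprMap.put(prevCol, e);
            continue;
          }
          String col = "_col" + reduceKeys.size();
          reduceKeys.add(e);
          seen.put(e.text, col);
          colExprMap.put(col, e);
        }
        return reduceKeys;  // its size is the "keyLength" passed downstream
      }

      public static void main(String[] args) {
        Map<String, Expr> colExprMap = new LinkedHashMap<>();
        List<Expr> keys = dedupKeys(
            List.of(new Expr("key", true), new Expr("'X'", true), new Expr("'X'", true)),
            colExprMap);
        System.out.println(keys.size() + " reduce keys, " + colExprMap.size() + " mapped columns");
      }
    }

For the inputs key, 'X', 'X' this prints "2 reduce keys, 2 mapped columns", which matches the new groupby_duplicate_key.q.out plan below: three GROUP BY output columns, but only _col0 and _col2 appear as reduce-sink key expressions.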
diff --git ql/src/test/results/clientpositive/groupby_duplicate_key.q.out ql/src/test/results/clientpositive/groupby_duplicate_key.q.out
new file mode 100644
index 0000000..e37b2d4
--- /dev/null
+++ ql/src/test/results/clientpositive/groupby_duplicate_key.q.out
@@ -0,0 +1,173 @@
+PREHOOK: query: explain
+select distinct key, "" as dummy1, "" as dummy2 from src tablesample (10 rows)
+PREHOOK: type: QUERY
+POSTHOOK: query: explain
+select distinct key, "" as dummy1, "" as dummy2 from src tablesample (10 rows)
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: src
+            Row Limit Per Split: 10
+            Statistics: Num rows: 58 Data size: 5812 Basic stats: COMPLETE Column stats: NONE
+            Select Operator
+              expressions: key (type: string)
+              outputColumnNames: key
+              Statistics: Num rows: 58 Data size: 5812 Basic stats: COMPLETE Column stats: NONE
+              Group By Operator
+                keys: key (type: string), '' (type: string), '' (type: string)
+                mode: hash
+                outputColumnNames: _col0, _col1, _col2
+                Statistics: Num rows: 58 Data size: 5812 Basic stats: COMPLETE Column stats: NONE
+                Reduce Output Operator
+                  key expressions: _col0 (type: string), _col2 (type: string)
+                  sort order: ++
+                  Map-reduce partition columns: _col0 (type: string), _col2 (type: string)
+                  Statistics: Num rows: 58 Data size: 5812 Basic stats: COMPLETE Column stats: NONE
+      Reduce Operator Tree:
+        Group By Operator
+          keys: KEY._col0 (type: string), KEY._col1 (type: string), KEY._col1 (type: string)
+          mode: mergepartial
+          outputColumnNames: _col0, _col1, _col2
+          Statistics: Num rows: 29 Data size: 2906 Basic stats: COMPLETE Column stats: NONE
+          Select Operator
+            expressions: _col0 (type: string), _col2 (type: string), _col2 (type: string)
+            outputColumnNames: _col0, _col1, _col2
+            Statistics: Num rows: 29 Data size: 2906 Basic stats: COMPLETE Column stats: NONE
+            File Output Operator
+              compressed: false
+              Statistics: Num rows: 29 Data size: 2906 Basic stats: COMPLETE Column stats: NONE
+              table:
+                  input format: org.apache.hadoop.mapred.TextInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                  serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select distinct key, "" as dummy1, "" as dummy2 from src tablesample (10 rows)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+#### A masked pattern was here ####
+POSTHOOK: query: select distinct key, "" as dummy1, "" as dummy2 from src tablesample (10 rows)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+#### A masked pattern was here ####
+165
+238
+255
+27
+278
+311
+409
+484
+86
+98
+PREHOOK: query: explain
+create table dummy as
+select distinct key, "X" as dummy1, "X" as dummy2 from src tablesample (10 rows)
+PREHOOK: type: CREATETABLE_AS_SELECT
+POSTHOOK: query: explain
+create table dummy as
+select distinct key, "X" as dummy1, "X" as dummy2 from src tablesample (10 rows)
+POSTHOOK: type: CREATETABLE_AS_SELECT
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+  Stage-3 depends on stages: Stage-0
+  Stage-2 depends on stages: Stage-3
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: src
+            Row Limit Per Split: 10
+            Statistics: Num rows: 58 Data size: 5812 Basic stats: COMPLETE Column stats: NONE
+            Select Operator
+              expressions: key (type: string)
+              outputColumnNames: key
+              Statistics: Num rows: 58 Data size: 5812 Basic stats: COMPLETE Column stats: NONE
+              Group By Operator
+                keys: key (type: string), 'X' (type: string), 'X' (type: string)
+                mode: hash
+                outputColumnNames: _col0, _col1, _col2
+                Statistics: Num rows: 58 Data size: 5812 Basic stats: COMPLETE Column stats: NONE
+                Reduce Output Operator
+                  key expressions: _col0 (type: string), _col2 (type: string)
+                  sort order: ++
+                  Map-reduce partition columns: _col0 (type: string), _col2 (type: string)
+                  Statistics: Num rows: 58 Data size: 5812 Basic stats: COMPLETE Column stats: NONE
+      Reduce Operator Tree:
+        Group By Operator
+          keys: KEY._col0 (type: string), KEY._col1 (type: string), KEY._col1 (type: string)
+          mode: mergepartial
+          outputColumnNames: _col0, _col1, _col2
+          Statistics: Num rows: 29 Data size: 2906 Basic stats: COMPLETE Column stats: NONE
+          Select Operator
+            expressions: _col0 (type: string), _col2 (type: string), _col2 (type: string)
+            outputColumnNames: _col0, _col1, _col2
+            Statistics: Num rows: 29 Data size: 2906 Basic stats: COMPLETE Column stats: NONE
+            File Output Operator
+              compressed: false
+              Statistics: Num rows: 29 Data size: 2906 Basic stats: COMPLETE Column stats: NONE
+              table:
+                  input format: org.apache.hadoop.mapred.TextInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                  serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                  name: default.dummy
+
+  Stage: Stage-0
+    Move Operator
+      files:
+          hdfs directory: true
+#### A masked pattern was here ####
+
+  Stage: Stage-3
+      Create Table Operator:
+        Create Table
+          columns: key string, dummy1 string, dummy2 string
+          input format: org.apache.hadoop.mapred.TextInputFormat
+          output format: org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat
+          name: dummy
+
+  Stage: Stage-2
+    Stats-Aggr Operator
+
+PREHOOK: query: create table dummy as
+select distinct key, "X" as dummy1, "X" as dummy2 from src tablesample (10 rows)
+PREHOOK: type: CREATETABLE_AS_SELECT
+PREHOOK: Input: default@src
+POSTHOOK: query: create table dummy as
+select distinct key, "X" as dummy1, "X" as dummy2 from src tablesample (10 rows)
+POSTHOOK: type: CREATETABLE_AS_SELECT
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@dummy
+PREHOOK: query: select key,dummy1,dummy2 from dummy
+PREHOOK: type: QUERY
+PREHOOK: Input: default@dummy
+#### A masked pattern was here ####
+POSTHOOK: query: select key,dummy1,dummy2 from dummy
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@dummy
+#### A masked pattern was here ####
+165	X	X
+238	X	X
+255	X	X
+27	X	X
+278	X	X
+311	X	X
+409	X	X
+484	X	X
+86	X	X
+98	X	X
diff --git ql/src/test/results/compiler/plan/groupby1.q.xml ql/src/test/results/compiler/plan/groupby1.q.xml
index af100ed..1f53052 100755
--- ql/src/test/results/compiler/plan/groupby1.q.xml
+++ ql/src/test/results/compiler/plan/groupby1.q.xml
@@ -363,7 +363,7 @@ - +
@@ -423,7 +423,11 @@ - + + + + + -1
@@ -615,7 +619,7 @@ - + 0.9
@@ -1328,7 +1332,7 @@ - + 0.9
diff --git ql/src/test/results/compiler/plan/groupby4.q.xml ql/src/test/results/compiler/plan/groupby4.q.xml
index 1822733..56fc265 100644
--- ql/src/test/results/compiler/plan/groupby4.q.xml
+++ ql/src/test/results/compiler/plan/groupby4.q.xml
@@ -197,7 +197,7 @@ - +
@@ -253,7 +253,11 @@ - + + + + + -1
@@ -405,7 +409,7 @@ - + 0.9
@@ -1009,7 +1013,7 @@ - + 0.9
diff --git ql/src/test/results/compiler/plan/groupby5.q.xml ql/src/test/results/compiler/plan/groupby5.q.xml
index 0bfc684..7a63fd1 100644
--- ql/src/test/results/compiler/plan/groupby5.q.xml
+++ ql/src/test/results/compiler/plan/groupby5.q.xml
@@ -215,7 +215,7 @@ - +
@@ -275,7 +275,11 @@ - + + + + + -1
@@ -467,7 +471,7 @@ - + 0.9
@@ -1203,7 +1207,7 @@ - + 0.9
diff --git ql/src/test/results/compiler/plan/groupby6.q.xml ql/src/test/results/compiler/plan/groupby6.q.xml
index 5b3696c..2cc7442 100644
--- ql/src/test/results/compiler/plan/groupby6.q.xml
+++ ql/src/test/results/compiler/plan/groupby6.q.xml
@@ -197,7 +197,7 @@ - +
@@ -253,7 +253,11 @@ - + + + + + -1
@@ -405,7 +409,7 @@ - + 0.9
@@ -1009,7 +1013,7 @@ - + 0.9
diff --git ql/src/test/results/compiler/plan/join1.q.xml ql/src/test/results/compiler/plan/join1.q.xml
index e88d5dd..0bd33c4 100644
--- ql/src/test/results/compiler/plan/join1.q.xml
+++ ql/src/test/results/compiler/plan/join1.q.xml
@@ -421,7 +421,7 @@ - +
@@ -481,7 +481,11 @@ - + + + + + 1
@@ -774,7 +778,7 @@ - +
@@ -830,7 +834,11 @@ - + + + + +
diff --git ql/src/test/results/compiler/plan/join2.q.xml ql/src/test/results/compiler/plan/join2.q.xml
index 11c44c7..75f1404 100644
--- ql/src/test/results/compiler/plan/join2.q.xml
+++ ql/src/test/results/compiler/plan/join2.q.xml
@@ -399,7 +399,7 @@ - +
@@ -459,7 +459,11 @@ - + + + + +
@@ -546,7 +550,7 @@ - + _col0
@@ -561,7 +565,7 @@ TS_19 - +
@@ -671,7 +675,7 @@ - +
@@ -731,7 +735,11 @@ - + + + + + 1
@@ -1815,7 +1823,7 @@ - +
@@ -1871,7 +1879,11 @@ - + + + + + 1
@@ -2121,7 +2133,7 @@ - +
@@ -2177,7 +2189,11 @@ - + + + + +
diff --git ql/src/test/results/compiler/plan/join3.q.xml ql/src/test/results/compiler/plan/join3.q.xml
index 6fde4e0..5276850 100644
--- ql/src/test/results/compiler/plan/join3.q.xml
+++ ql/src/test/results/compiler/plan/join3.q.xml
@@ -472,7 +472,7 @@ - +
@@ -528,7 +528,11 @@ - + + + + + 1
@@ -800,7 +804,7 @@ - +
@@ -860,7 +864,11 @@ - + + + + + 2
@@ -1145,7 +1153,7 @@ - +
@@ -1201,7 +1209,11 @@ - + + + + +
diff --git ql/src/test/results/compiler/plan/join4.q.xml ql/src/test/results/compiler/plan/join4.q.xml
index 22a4911..6d3739e 100644
--- ql/src/test/results/compiler/plan/join4.q.xml
+++ ql/src/test/results/compiler/plan/join4.q.xml
@@ -270,7 +270,7 @@ - +
@@ -330,7 +330,11 @@ - + + + + +
@@ -795,7 +799,7 @@ - +
@@ -855,7 +859,11 @@ - + + + + + 1
diff --git ql/src/test/results/compiler/plan/join5.q.xml ql/src/test/results/compiler/plan/join5.q.xml
index 5033366..6d3ce7c 100644
--- ql/src/test/results/compiler/plan/join5.q.xml
+++ ql/src/test/results/compiler/plan/join5.q.xml
@@ -270,7 +270,7 @@ - +
@@ -330,7 +330,11 @@ - + + + + +
@@ -795,7 +799,7 @@ - +
@@ -855,7 +859,11 @@ - + + + + + 1
diff --git ql/src/test/results/compiler/plan/join6.q.xml ql/src/test/results/compiler/plan/join6.q.xml
index b1185a9..f8a7670 100644
--- ql/src/test/results/compiler/plan/join6.q.xml
+++ ql/src/test/results/compiler/plan/join6.q.xml
@@ -270,7 +270,7 @@ - +
@@ -330,7 +330,11 @@ - + + + + +
@@ -795,7 +799,7 @@ - +
@@ -855,7 +859,11 @@ - + + + + + 1
diff --git ql/src/test/results/compiler/plan/join7.q.xml ql/src/test/results/compiler/plan/join7.q.xml
index a1ab3e6..2b317ae 100644
--- ql/src/test/results/compiler/plan/join7.q.xml
+++ ql/src/test/results/compiler/plan/join7.q.xml
@@ -335,7 +335,7 @@ - +
@@ -395,7 +395,11 @@ - + + + + +
@@ -860,7 +864,7 @@ - +
@@ -920,7 +924,11 @@ - + + + + + 1
@@ -1376,7 +1384,7 @@ - +
@@ -1436,7 +1444,11 @@ - + + + + + 2
diff --git ql/src/test/results/compiler/plan/join8.q.xml ql/src/test/results/compiler/plan/join8.q.xml
index ba128d4..b74c543 100644
--- ql/src/test/results/compiler/plan/join8.q.xml
+++ ql/src/test/results/compiler/plan/join8.q.xml
@@ -270,7 +270,7 @@ - +
@@ -330,7 +330,11 @@ - + + + + +
@@ -836,7 +840,7 @@ - +
@@ -896,7 +900,11 @@ - + + + + + 1
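The last SemanticAnalyzer hunk above also rewrites how partitionCols is derived from keyCols: build a fresh list instead of aliasing keyCols, take a prefix with subList() when numPartitionFields is smaller than the number of keys, and fall back to a single rand() expression when numPartitionFields is -1 (random partitioning). A small generic sketch of that selection logic follows; the names (PartitionColsSketch, choosePartitionCols) are assumptions for illustration, not Hive's PlanUtils API.

    import java.util.ArrayList;
    import java.util.List;

    // Illustrative sketch only (assumed names, not Hive code) of the
    // partition-column selection simplified in the hunk above.
    public final class PartitionColsSketch {

      static <T> List<T> choosePartitionCols(List<T> keyCols, int numPartitionFields, T randExpr) {
        List<T> partitionCols = new ArrayList<>();
        if (numPartitionFields >= keyCols.size()) {
          partitionCols.addAll(keyCols);                                 // partition on every key
        } else if (numPartitionFields >= 0) {
          partitionCols.addAll(keyCols.subList(0, numPartitionFields));  // key prefix
        } else {
          partitionCols.add(randExpr);                                   // -1: random partitioning
        }
        return partitionCols;
      }

      public static void main(String[] args) {
        List<String> keys = List.of("_col0", "_col1", "_col2");
        System.out.println(choosePartitionCols(keys, 2, "rand()"));   // [_col0, _col1]
        System.out.println(choosePartitionCols(keys, -1, "rand()"));  // [rand()]
      }
    }

Copying into a new list also removes the old partitionCols = keyCols aliasing in the numPartitionFields >= keyCols.size() branch, so the partition-column list can be modified without touching the reduce keys.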