diff --git ql/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java ql/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java
index 8ae1c73..97c7eae 100644
--- ql/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java
+++ ql/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java
@@ -19,6 +19,7 @@
 package org.apache.hadoop.hive.ql;
 
 import org.antlr.runtime.tree.Tree;
+import org.apache.hadoop.hive.conf.HiveConf;
 import org.apache.hadoop.hive.ql.metadata.HiveUtils;
 import org.apache.hadoop.hive.ql.parse.ASTNode;
 import org.apache.hadoop.hive.ql.parse.ASTNodeOrigin;
@@ -29,12 +30,6 @@
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
-import org.antlr.runtime.tree.Tree;
-import org.apache.hadoop.hive.conf.HiveConf;
-import org.apache.hadoop.hive.ql.metadata.HiveUtils;
-import org.apache.hadoop.hive.ql.parse.ASTNode;
-import org.apache.hadoop.hive.ql.parse.ASTNodeOrigin;
-
 /**
  * List of all error messages.
  * This list contains both compile time and run-time errors.
@@ -80,7 +75,6 @@
   INVALID_JOIN_CONDITION_2(10018, "Neither left nor right aliases encountered in JOIN"),
   INVALID_JOIN_CONDITION_3(10019, "OR not supported in JOIN currently"),
   INVALID_TRANSFORM(10020, "TRANSFORM with other SELECT columns not supported"),
-  DUPLICATE_GROUPBY_KEY(10021, "Repeated key in GROUP BY"),
   UNSUPPORTED_MULTIPLE_DISTINCTS(10022, "DISTINCT on different columns not supported" +
       " with skew in data"),
   NO_SUBQUERY_ALIAS(10023, "No alias for subquery"),
diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
index 6cdaedb..b898d3e 100644
--- ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
+++ ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
@@ -62,6 +62,7 @@
 import org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator;
 import org.apache.hadoop.hive.ql.exec.ArchiveUtils;
 import org.apache.hadoop.hive.ql.exec.ColumnInfo;
+import org.apache.hadoop.hive.ql.exec.ExprNodeEvaluatorFactory;
 import org.apache.hadoop.hive.ql.exec.FetchTask;
 import org.apache.hadoop.hive.ql.exec.FileSinkOperator;
 import org.apache.hadoop.hive.ql.exec.FunctionInfo;
@@ -4080,6 +4081,8 @@ private ReduceSinkOperator genGroupByPlanReduceSinkOperator(QB qb,
         reduceSinkInputRowResolver, reduceSinkOutputRowResolver, outputKeyColumnNames,
         colExprMap);
 
+    int keyLength = reduceKeys.size();
+
     // add a key for reduce sink
     if (groupingSetsPresent) {
       // Process grouping set for the reduce sink operator
@@ -4131,7 +4134,7 @@ private ReduceSinkOperator genGroupByPlanReduceSinkOperator(QB qb,
     ReduceSinkOperator rsOp = (ReduceSinkOperator) putOpInsertMap(
         OperatorFactory.getAndMakeChild(
         PlanUtils.getReduceSinkDesc(reduceKeys,
-            groupingSetsPresent ? grpByExprs.size() + 1 : grpByExprs.size(),
+            groupingSetsPresent ? keyLength + 1 : keyLength,
             reduceValues, distinctColIndices, outputKeyColumnNames,
             outputValueColumnNames, true, -1,
             numPartitionFields, numReducers),
@@ -4152,24 +4155,32 @@
       ASTNode grpbyExpr = grpByExprs.get(i);
       ExprNodeDesc inputExpr = genExprNodeDesc(grpbyExpr,
           reduceSinkInputRowResolver);
-      reduceKeys.add(inputExpr);
-      if (reduceSinkOutputRowResolver.getExpression(grpbyExpr) == null) {
-        outputKeyColumnNames.add(getColumnInternalName(reduceKeys.size() - 1));
-        String field = Utilities.ReduceField.KEY.toString() + "."
-            + getColumnInternalName(reduceKeys.size() - 1);
-        ColumnInfo colInfo = new ColumnInfo(field, reduceKeys.get(
-            reduceKeys.size() - 1).getTypeInfo(), null, false);
-        reduceSinkOutputRowResolver.putExpression(grpbyExpr, colInfo);
-        colExprMap.put(colInfo.getInternalName(), inputExpr);
-      } else {
-        throw new SemanticException(ErrorMsg.DUPLICATE_GROUPBY_KEY
-            .getMsg(grpbyExpr));
+      ColumnInfo prev = reduceSinkOutputRowResolver.getExpression(grpbyExpr);
+      if (prev != null && isDeterministic(inputExpr)) {
+        colExprMap.put(prev.getInternalName(), inputExpr);
+        continue;
       }
+      reduceKeys.add(inputExpr);
+      outputKeyColumnNames.add(getColumnInternalName(reduceKeys.size() - 1));
+      String field = Utilities.ReduceField.KEY.toString() + "."
+          + getColumnInternalName(reduceKeys.size() - 1);
+      ColumnInfo colInfo = new ColumnInfo(field, reduceKeys.get(
+          reduceKeys.size() - 1).getTypeInfo(), null, false);
+      reduceSinkOutputRowResolver.putExpression(grpbyExpr, colInfo);
+      colExprMap.put(colInfo.getInternalName(), inputExpr);
     }
     return reduceKeys;
   }
 
+  private boolean isDeterministic(ExprNodeDesc expr) throws SemanticException {
+    try {
+      return ExprNodeEvaluatorFactory.get(expr).isDeterministic();
+    } catch (Exception e) {
+      throw new SemanticException(e);
+    }
+  }
+
   private List<List<Integer>> getDistinctColIndicesForReduceSink(QBParseInfo parseInfo,
       String dest,
       List<ExprNodeDesc> reduceKeys, RowResolver reduceSinkInputRowResolver,
@@ -4275,6 +4286,8 @@ private ReduceSinkOperator genCommonGroupByPlanReduceSinkOperator(QB qb,
     List<List<Integer>> distinctColIndices = getDistinctColIndicesForReduceSink(parseInfo, dest,
         reduceKeys, reduceSinkInputRowResolver, reduceSinkOutputRowResolver,
         outputKeyColumnNames, colExprMap);
@@ -4322,8 +4335,8 @@ private ReduceSinkOperator genCommonGroupByPlanReduceSinkOperator(QB qb,
       List<String> outputKeyColumnNames, List<String> outputValueColumnNames,
       boolean includeKey, int tag, int numPartitionFields, int numReducers) throws SemanticException {
-    ArrayList<ExprNodeDesc> partitionCols = null;
+    ArrayList<ExprNodeDesc> partitionCols = new ArrayList<ExprNodeDesc>();
     if (numPartitionFields >= keyCols.size()) {
-      partitionCols = keyCols;
+      partitionCols.addAll(keyCols);
     } else if (numPartitionFields >= 0) {
-      partitionCols = new ArrayList<ExprNodeDesc>(numPartitionFields);
-      for (int i = 0; i < numPartitionFields; i++) {
-        partitionCols.add(keyCols.get(i));
-      }
+      partitionCols.addAll(keyCols.subList(0, numPartitionFields));
     } else {
       // numPartitionFields = -1 means random partitioning
-      partitionCols = new ArrayList<ExprNodeDesc>(1);
-      partitionCols.add(TypeCheckProcFactory.DefaultExprProcessor
-          .getFuncExprNodeDesc("rand"));
+      partitionCols.add(TypeCheckProcFactory.DefaultExprProcessor.getFuncExprNodeDesc("rand"));
     }
     StringBuilder order = new StringBuilder();
diff --git ql/src/test/queries/clientpositive/groupby_duplicate_key.q ql/src/test/queries/clientpositive/groupby_duplicate_key.q
new file mode 100644
index 0000000..7f38efe
--- /dev/null
+++ ql/src/test/queries/clientpositive/groupby_duplicate_key.q
@@ -0,0 +1,13 @@
+explain
+select distinct key, "" as dummy1, "" as dummy2 from src tablesample (10 rows);
+
+select distinct key, "" as dummy1, "" as dummy2 from src tablesample (10 rows);
+
+explain
+create table dummy as
+select distinct key, "X" as dummy1, "X" as dummy2 from src tablesample (10 rows);
+
+create table dummy as
+select distinct key, "X" as dummy1, "X" as dummy2 from src tablesample (10 rows);
+
+select key,dummy1,dummy2 from dummy;
diff --git ql/src/test/results/clientpositive/groupby_duplicate_key.q.out ql/src/test/results/clientpositive/groupby_duplicate_key.q.out
new file mode 100644
index 0000000..e37b2d4
--- /dev/null
+++ ql/src/test/results/clientpositive/groupby_duplicate_key.q.out
@@ -0,0 +1,173 @@
+PREHOOK: query: explain
+select distinct key, "" as dummy1, "" as dummy2 from src tablesample (10 rows)
+PREHOOK: type: QUERY
+POSTHOOK: query: explain
+select distinct key, "" as dummy1, "" as dummy2 from src tablesample (10 rows)
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: src
+            Row Limit Per Split: 10
+            Statistics: Num rows: 58 Data size: 5812 Basic stats: COMPLETE Column stats: NONE
+            Select Operator
+              expressions: key (type: string)
+              outputColumnNames: key
+              Statistics: Num rows: 58 Data size: 5812 Basic stats: COMPLETE Column stats: NONE
+              Group By Operator
+                keys: key (type: string), '' (type: string), '' (type: string)
+                mode: hash
+                outputColumnNames: _col0, _col1, _col2
+                Statistics: Num rows: 58 Data size: 5812 Basic stats: COMPLETE Column stats: NONE
+                Reduce Output Operator
+                  key expressions: _col0 (type: string), _col2 (type: string)
+                  sort order: ++
+                  Map-reduce partition columns: _col0 (type: string), _col2 (type: string)
+                  Statistics: Num rows: 58 Data size: 5812 Basic stats: COMPLETE Column stats: NONE
+      Reduce Operator Tree:
+        Group By Operator
+          keys: KEY._col0 (type: string), KEY._col1 (type: string), KEY._col1 (type: string)
+          mode: mergepartial
+          outputColumnNames: _col0, _col1, _col2
+          Statistics: Num rows: 29 Data size: 2906 Basic stats: COMPLETE Column stats: NONE
+          Select Operator
+            expressions: _col0 (type: string), _col2 (type: string), _col2 (type: string)
+            outputColumnNames: _col0, _col1, _col2
+            Statistics: Num rows: 29 Data size: 2906 Basic stats: COMPLETE Column stats: NONE
+            File Output Operator
+              compressed: false
+              Statistics: Num rows: 29 Data size: 2906 Basic stats: COMPLETE Column stats: NONE
+              table:
+                  input format: org.apache.hadoop.mapred.TextInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                  serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select distinct key, "" as dummy1, "" as dummy2 from src tablesample (10 rows)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+#### A masked pattern was here ####
+POSTHOOK: query: select distinct key, "" as dummy1, "" as dummy2 from src tablesample (10 rows)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+#### A masked pattern was here ####
+165
+238
+255
+27
+278
+311
+409
+484
+86
+98
+PREHOOK: query: explain
+create table dummy as
+select distinct key, "X" as dummy1, "X" as dummy2 from src tablesample (10 rows)
+PREHOOK: type: CREATETABLE_AS_SELECT
+POSTHOOK: query: explain
+create table dummy as
+select distinct key, "X" as dummy1, "X" as dummy2 from src tablesample (10 rows)
+POSTHOOK: type: CREATETABLE_AS_SELECT
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+  Stage-3 depends on stages: Stage-0
+  Stage-2 depends on stages: Stage-3
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: src
+            Row Limit Per Split: 10
+            Statistics: Num rows: 58 Data size: 5812 Basic stats: COMPLETE Column stats: NONE
+            Select Operator
+              expressions: key (type: string)
+              outputColumnNames: key
+              Statistics: Num rows: 58 Data size: 5812 Basic stats: COMPLETE Column stats: NONE
+              Group By Operator
+                keys: key (type: string), 'X' (type: string), 'X' (type: string)
+                mode: hash
+                outputColumnNames: _col0, _col1, _col2
+                Statistics: Num rows: 58 Data size: 5812 Basic stats: COMPLETE Column stats: NONE
+                Reduce Output Operator
+                  key expressions: _col0 (type: string), _col2 (type: string)
+                  sort order: ++
+                  Map-reduce partition columns: _col0 (type: string), _col2 (type: string)
+                  Statistics: Num rows: 58 Data size: 5812 Basic stats: COMPLETE Column stats: NONE
+      Reduce Operator Tree:
+        Group By Operator
+          keys: KEY._col0 (type: string), KEY._col1 (type: string), KEY._col1 (type: string)
+          mode: mergepartial
+          outputColumnNames: _col0, _col1, _col2
+          Statistics: Num rows: 29 Data size: 2906 Basic stats: COMPLETE Column stats: NONE
+          Select Operator
+            expressions: _col0 (type: string), _col2 (type: string), _col2 (type: string)
+            outputColumnNames: _col0, _col1, _col2
+            Statistics: Num rows: 29 Data size: 2906 Basic stats: COMPLETE Column stats: NONE
+            File Output Operator
+              compressed: false
+              Statistics: Num rows: 29 Data size: 2906 Basic stats: COMPLETE Column stats: NONE
+              table:
+                  input format: org.apache.hadoop.mapred.TextInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                  serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                  name: default.dummy
+
+  Stage: Stage-0
+    Move Operator
+      files:
+          hdfs directory: true
+#### A masked pattern was here ####
+
+  Stage: Stage-3
+      Create Table Operator:
+        Create Table
+          columns: key string, dummy1 string, dummy2 string
+          input format: org.apache.hadoop.mapred.TextInputFormat
+          output format: org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat
+          name: dummy
+
+  Stage: Stage-2
+    Stats-Aggr Operator
+
+PREHOOK: query: create table dummy as
+select distinct key, "X" as dummy1, "X" as dummy2 from src tablesample (10 rows)
+PREHOOK: type: CREATETABLE_AS_SELECT
+PREHOOK: Input: default@src
+POSTHOOK: query: create table dummy as
+select distinct key, "X" as dummy1, "X" as dummy2 from src tablesample (10 rows)
+POSTHOOK: type: CREATETABLE_AS_SELECT
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@dummy
+PREHOOK: query: select key,dummy1,dummy2 from dummy
+PREHOOK: type: QUERY
+PREHOOK: Input: default@dummy
+#### A masked pattern was here ####
+POSTHOOK: query: select key,dummy1,dummy2 from dummy
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@dummy
+#### A masked pattern was here ####
+165	X	X
+238	X	X
+255	X	X
+27	X	X
+278	X	X
+311	X	X
+409	X	X
+484	X	X
+86	X	X
+98	X	X
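
Note on the core idea in the SemanticAnalyzer hunk at line 4152: a repeated GROUP BY key no longer raises DUPLICATE_GROUPBY_KEY. When the repeated expression is deterministic, its later occurrences are not added as reduce-sink keys; they are simply mapped to the key column produced by the first occurrence, which is why the plans above emit only two key columns for key, 'X', 'X'. The following is a minimal, self-contained sketch of that deduplication idea written outside Hive; the Expr record, its deterministic flag, and dedupKeys are hypothetical stand-ins for illustration, not Hive APIs.

import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

public class GroupByKeyDedupSketch {

  // Hypothetical stand-in for an expression node: a textual form plus a
  // determinism flag (Hive derives the flag from the expression's UDF).
  record Expr(String text, boolean deterministic) {}

  // Keep the first occurrence of each deterministic key expression; map the
  // position of every later duplicate to the key column already emitted for it.
  // Non-deterministic expressions (e.g. rand()) are never collapsed.
  static List<Expr> dedupKeys(List<Expr> groupByExprs, Map<Integer, Integer> dupToKey) {
    List<Expr> reduceKeys = new ArrayList<>();
    Map<String, Integer> firstKeyIndex = new LinkedHashMap<>();
    for (int i = 0; i < groupByExprs.size(); i++) {
      Expr e = groupByExprs.get(i);
      Integer prev = firstKeyIndex.get(e.text());
      if (prev != null && e.deterministic()) {
        dupToKey.put(i, prev);          // reuse the existing key column
        continue;
      }
      firstKeyIndex.put(e.text(), reduceKeys.size());
      reduceKeys.add(e);
    }
    return reduceKeys;
  }

  public static void main(String[] args) {
    List<Expr> exprs = List.of(
        new Expr("key", true), new Expr("'X'", true), new Expr("'X'", true));
    Map<Integer, Integer> dupToKey = new LinkedHashMap<>();
    List<Expr> keys = dedupKeys(exprs, dupToKey);
    System.out.println(keys.size());   // 2: key plus a single 'X' key column
    System.out.println(dupToKey);      // {2=1}: the third expression reuses key column 1
  }
}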