Index: ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java	(revision 1139238)
+++ ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java	(working copy)
@@ -21,15 +21,16 @@
 import java.io.IOException;
 import java.io.Serializable;
 import java.util.ArrayList;
+import java.util.Collection;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
-import java.util.Map.Entry;
 import java.util.Set;
 import java.util.TreeSet;
+import java.util.Map.Entry;
 import java.util.regex.Pattern;
 import java.util.regex.PatternSyntaxException;
@@ -92,7 +94,6 @@
 import org.apache.hadoop.hive.ql.optimizer.GenMRFileSink1;
 import org.apache.hadoop.hive.ql.optimizer.GenMROperator;
 import org.apache.hadoop.hive.ql.optimizer.GenMRProcContext;
-import org.apache.hadoop.hive.ql.optimizer.GenMRProcContext.GenMapRedCtx;
 import org.apache.hadoop.hive.ql.optimizer.GenMRRedSink1;
 import org.apache.hadoop.hive.ql.optimizer.GenMRRedSink2;
 import org.apache.hadoop.hive.ql.optimizer.GenMRRedSink3;
@@ -102,6 +103,7 @@
 import org.apache.hadoop.hive.ql.optimizer.GenMapRedUtils;
 import org.apache.hadoop.hive.ql.optimizer.MapJoinFactory;
 import org.apache.hadoop.hive.ql.optimizer.Optimizer;
+import org.apache.hadoop.hive.ql.optimizer.GenMRProcContext.GenMapRedCtx;
 import org.apache.hadoop.hive.ql.optimizer.physical.PhysicalContext;
 import org.apache.hadoop.hive.ql.optimizer.physical.PhysicalOptimizer;
 import org.apache.hadoop.hive.ql.optimizer.ppr.PartitionPruner;
@@ -122,7 +124,6 @@
 import org.apache.hadoop.hive.ql.plan.FetchWork;
 import org.apache.hadoop.hive.ql.plan.FileSinkDesc;
 import org.apache.hadoop.hive.ql.plan.FilterDesc;
-import org.apache.hadoop.hive.ql.plan.FilterDesc.sampleDesc;
 import org.apache.hadoop.hive.ql.plan.ForwardDesc;
 import org.apache.hadoop.hive.ql.plan.GroupByDesc;
 import org.apache.hadoop.hive.ql.plan.HiveOperation;
@@ -145,12 +146,13 @@
 import org.apache.hadoop.hive.ql.plan.TableScanDesc;
 import org.apache.hadoop.hive.ql.plan.UDTFDesc;
 import org.apache.hadoop.hive.ql.plan.UnionDesc;
+import org.apache.hadoop.hive.ql.plan.FilterDesc.sampleDesc;
 import org.apache.hadoop.hive.ql.session.SessionState;
 import org.apache.hadoop.hive.ql.session.SessionState.ResourceType;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
-import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.Mode;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFHash;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.Mode;
 import org.apache.hadoop.hive.serde.Constants;
 import org.apache.hadoop.hive.serde2.Deserializer;
 import org.apache.hadoop.hive.serde2.MetadataTypedColumnsetSerDe;
@@ -158,9 +160,9 @@
 import org.apache.hadoop.hive.serde2.SerDeUtils;
 import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
-import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
 import org.apache.hadoop.hive.serde2.objectinspector.StructField;
 import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
 import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
 import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
 import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
@@ -2137,6 +2139,7 @@
     }
 
     if (expr.getType() == HiveParser.TOK_ALLCOLREF) {
+      validateGroupBy(input, qb, expr);
       pos = genColListRegex(".*", expr.getChildCount() == 0 ? null
           : getUnescapedName((ASTNode) expr.getChild(0)).toLowerCase(),
           expr, col_list, inputRR, pos, out_rwsch, qb.getAliases());
@@ -2214,6 +2217,67 @@
     return output;
   }
 
+  /**
+   * For a group-by query whose select clause uses '*', verify that every
+   * column expanded from '*' is also a group-by key; otherwise fail with
+   * NON_KEY_EXPR_IN_GROUPBY.
+   */
+  private void validateGroupBy(Operator<? extends Serializable> input, QB qb,
+      ASTNode astNode) throws SemanticException {
+
+    final String DELIMITER = "_";
+    // Nothing to validate if the query has no group-by clause.
+    if (qb.getParseInfo().getDestToGroupBy().isEmpty()) {
+      return;
+    }
+    List<String> fieldNameValues = new ArrayList<String>();
+    RowResolver rr = opParseCtx.get(input).getRowResolver();
+    List<FieldSchema> groupByExpr = convertRowSchemaToViewSchema(rr);
+    Set<String> groupByNames = new HashSet<String>(groupByExpr.size(), 1.0f);
+    Set<String> tableNames = rr.getTableNames();
+
+    // Collect the column names expanded from '*', prefixed with their table alias.
+    if (astNode.getChildCount() == 0) {
+      // 'SELECT *': expand the columns of every table in the query.
+      for (String alias : qb.getTabAliases()) {
+        Table srcForAlias = qb.getMetaData().getSrcForAlias(alias);
+        if (null != srcForAlias) {
+          for (StructField structField : srcForAlias.getFields()) {
+            fieldNameValues.add(alias + DELIMITER + structField.getFieldName());
+          }
+        }
+      }
+    } else {
+      // 'SELECT alias.*': expand only the columns of the given table alias.
+      String alias = astNode.getChild(0).getText();
+      Table srcForAlias = qb.getMetaData().getSrcForAlias(alias);
+      if (null != srcForAlias) {
+        for (StructField structField : srcForAlias.getFields()) {
+          fieldNameValues.add(alias + DELIMITER + structField.getFieldName());
+        }
+      }
+    }
+    // No alias is recorded when the query reads a single table; in that case
+    // tableNames.size() is 1 and its only entry is the empty string.
+    if (tableNames.size() == 1 && tableNames.iterator().next().equals("")) {
+      for (String alias : qb.getTabAliases()) {
+        for (FieldSchema schema : groupByExpr) {
+          groupByNames.add(alias + DELIMITER + schema.getName());
+        }
+      }
+    } else {
+      for (String groupByAlias : tableNames) {
+        HashMap<String, ColumnInfo> fieldMap = rr.getFieldMap(groupByAlias);
+        for (ColumnInfo columnInfo : fieldMap.values()) {
+          String colName = rr.reverseLookup(columnInfo.getInternalName())[1];
+          groupByNames.add(groupByAlias + DELIMITER + colName);
+        }
+      }
+    }
+    // Validate the column names in the select clause against the columns
+    // specified in the group-by clause.
+    for (String fieldNameValue : fieldNameValues) {
+      if (!groupByNames.contains(fieldNameValue)) {
+        throw new SemanticException(
+            ErrorMsg.NON_KEY_EXPR_IN_GROUPBY.getMsg(fieldNameValue));
+      }
+    }
+  }
+
   /**
    * Class to store GenericUDAF related information.
    */
@@ -2367,8 +2431,30 @@
           .getInternalName(), "", false));
       String field = getColumnInternalName(i);
       outputColumnNames.add(field);
-      groupByOutputRowResolver.putExpression(grpbyExpr,
-          new ColumnInfo(field, exprInfo.getType(), null, false));
+
+      // Register the group-by key under (tableAlias, columnName) extracted
+      // from the ASTNode, rather than under the expression tree:
+      // 1. childCount == 2 and not TOK_FUNCTION: a column qualified by a
+      //    table alias (t.col).
+      // 2. childCount == 1 and not TOK_FUNCTION: an unqualified column (col).
+      // 3. otherwise (e.g. a function call): keyed by its string tree.
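+      // For example, "GROUP BY src.key" parses to (. (TOK_TABLE_OR_COL src) key),
+      // so getChild(0).getChild(0) is the alias "src" and getChild(1) is "key";
+      // "GROUP BY key" parses to the single-child node (TOK_TABLE_OR_COL key);
+      // "GROUP BY substr(key, 1, 1)" parses to a TOK_FUNCTION subtree.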
+
+      if (grpbyExpr.getChildCount() == 2
+          && grpbyExpr.getType() != HiveParser.TOK_FUNCTION) {
+        String tableAlias = "";
+        if (null != grpbyExpr.getChild(0).getChild(0)) {
+          tableAlias = grpbyExpr.getChild(0).getChild(0).toString();
+        }
+        groupByOutputRowResolver.put(tableAlias,
+            grpbyExpr.getChild(1).toString(),
+            new ColumnInfo(field, exprInfo.getType(), "", false));
+      } else if (grpbyExpr.getChildCount() == 1
+          && grpbyExpr.getType() != HiveParser.TOK_FUNCTION) {
+        groupByOutputRowResolver.put("", grpbyExpr.getChild(0).toString(),
+            new ColumnInfo(field, exprInfo.getType(), "", false));
+      } else {
+        groupByOutputRowResolver.put("", grpbyExpr.toStringTree(),
+            new ColumnInfo(field, exprInfo.getType(), "", false));
+      }
+
+      colExprMap.put(field, groupByKeys.get(groupByKeys.size() - 1));
     }
 
     // For each aggregation
@@ -2897,8 +2983,21 @@
       ASTNode grpbyExpr = grpByExprs.get(i);
       String field = getColumnInternalName(i);
       outputColumnNames.add(field);
-      TypeInfo typeInfo = reduceSinkInputRowResolver2.getExpression(
-          grpbyExpr).getType();
+      // Look up the group-by key the same way it was registered above: by
+      // (tableAlias, columnName) for plain columns, by string tree otherwise.
+      TypeInfo typeInfo = null;
+      if (grpbyExpr.getChildCount() == 2
+          && grpbyExpr.getType() != HiveParser.TOK_FUNCTION) {
+        String tableAlias = "";
+        if (null != grpbyExpr.getChild(0).getChild(0)) {
+          tableAlias = grpbyExpr.getChild(0).getChild(0).toString();
+        }
+        typeInfo = reduceSinkInputRowResolver2.get(tableAlias,
+            grpbyExpr.getChild(1).toString()).getType();
+      } else if (grpbyExpr.getChildCount() == 1
+          && grpbyExpr.getType() != HiveParser.TOK_FUNCTION) {
+        typeInfo = reduceSinkInputRowResolver2.get("",
+            grpbyExpr.getChild(0).toString()).getType();
+      } else {
+        typeInfo = reduceSinkInputRowResolver2.get("",
+            grpbyExpr.toStringTree()).getType();
+      }
       ExprNodeColumnDesc inputExpr = new ExprNodeColumnDesc(typeInfo, field,
           "", false);
       reduceKeys.add(inputExpr);
Index: ql/src/test/queries/clientnegative/groupby_selectall.q
===================================================================
--- ql/src/test/queries/clientnegative/groupby_selectall.q	(revision 0)
+++ ql/src/test/queries/clientnegative/groupby_selectall.q	(revision 0)
@@ -0,0 +1,16 @@
+CREATE TABLE dest1(key STRING, c1 INT, c2 STRING) STORED AS TEXTFILE;
+
+EXPLAIN EXTENDED
+FROM srcpart src
+INSERT OVERWRITE TABLE dest1
+SELECT substr(src.key,1,1), count(DISTINCT substr(src.value,5)), concat(substr(src.key,1,1),sum(substr(src.value,5)))
+WHERE src.ds = '2008-04-08'
+GROUP BY substr(src.key,1,1);
+
+FROM srcpart src
+INSERT OVERWRITE TABLE dest1
+SELECT substr(src.key,1,1), count(DISTINCT substr(src.value,5)), concat(substr(src.key,1,1),sum(substr(src.value,5)))
+WHERE src.ds = '2008-04-08'
+GROUP BY substr(src.key,1,1);
+
+SELECT * FROM dest1 GROUP BY key;
Index: ql/src/test/queries/clientpositive/groupby_selectall.q
===================================================================
--- ql/src/test/queries/clientpositive/groupby_selectall.q	(revision 0)
+++ ql/src/test/queries/clientpositive/groupby_selectall.q	(revision 0)
@@ -0,0 +1,16 @@
+CREATE TABLE dest1(key STRING, c1 INT, c2 STRING) STORED AS TEXTFILE;
+
+EXPLAIN EXTENDED
+FROM srcpart src
+INSERT OVERWRITE TABLE dest1
+SELECT substr(src.key,1,1), count(DISTINCT substr(src.value,5)), concat(substr(src.key,1,1),sum(substr(src.value,5)))
+WHERE src.ds = '2008-04-08'
+GROUP BY substr(src.key,1,1);
+
+FROM srcpart src
+INSERT OVERWRITE TABLE dest1
+SELECT substr(src.key,1,1), count(DISTINCT substr(src.value,5)), concat(substr(src.key,1,1),sum(substr(src.value,5)))
+WHERE src.ds = '2008-04-08'
+GROUP BY substr(src.key,1,1);
+
+SELECT * FROM dest1 GROUP BY key,c1,c2;
Index: ql/src/test/results/clientnegative/groupby_selectall.q.out
===================================================================
--- ql/src/test/results/clientnegative/groupby_selectall.q.out	(revision 0)
+++ ql/src/test/results/clientnegative/groupby_selectall.q.out	(revision 0)
@@ -0,0 +1,269 @@
+PREHOOK: query: CREATE TABLE dest1(key STRING, c1 INT, c2 STRING) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE dest1(key STRING, c1 INT, c2 STRING) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@dest1
+PREHOOK: query: EXPLAIN EXTENDED
+FROM srcpart src
+INSERT OVERWRITE TABLE dest1
+SELECT substr(src.key,1,1), count(DISTINCT substr(src.value,5)), concat(substr(src.key,1,1),sum(substr(src.value,5)))
+WHERE src.ds = '2008-04-08'
+GROUP BY substr(src.key,1,1)
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN EXTENDED
+FROM srcpart src
+INSERT OVERWRITE TABLE dest1
+SELECT substr(src.key,1,1), count(DISTINCT substr(src.value,5)), concat(substr(src.key,1,1),sum(substr(src.value,5)))
+WHERE src.ds = '2008-04-08'
+GROUP BY substr(src.key,1,1)
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+  (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME srcpart) src)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME dest1))) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) key) 1 1)) (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION concat (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) key) 1 1) (TOK_FUNCTION sum (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))))) (TOK_WHERE (= (. (TOK_TABLE_OR_COL src) ds) '2008-04-08')) (TOK_GROUPBY (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) key) 1 1))))
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+  Stage-2 depends on stages: Stage-0
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Alias -> Map Operator Tree:
+        src
+          TableScan
+            alias: src
+            GatherStats: false
+            Filter Operator
+              isSamplingPred: false
+              predicate:
+                  expr: (ds = '2008-04-08')
+                  type: boolean
+              Select Operator
+                expressions:
+                      expr: key
+                      type: string
+                      expr: value
+                      type: string
+                outputColumnNames: key, value
+                Group By Operator
+                  aggregations:
+                        expr: count(DISTINCT substr(value, 5))
+                        expr: sum(substr(value, 5))
+                  bucketGroup: false
+                  keys:
+                        expr: substr(key, 1, 1)
+                        type: string
+                        expr: substr(value, 5)
+                        type: string
+                  mode: hash
+                  outputColumnNames: _col0, _col1, _col2, _col3
+                  Reduce Output Operator
+                    key expressions:
+                          expr: _col0
+                          type: string
+                          expr: _col1
+                          type: string
+                    sort order: ++
+                    Map-reduce partition columns:
+                          expr: _col0
+                          type: string
+                    tag: -1
+                    value expressions:
+                          expr: _col2
+                          type: bigint
+                          expr: _col3
+                          type: double
+      Needs Tagging: false
+      Path -> Alias:
+        pfile:/opensourcetest/23mayTestEnv/build/ql/test/data/warehouse/srcpart/ds=2008-04-08/hr=11 [src]
+        pfile:/opensourcetest/23mayTestEnv/build/ql/test/data/warehouse/srcpart/ds=2008-04-08/hr=12 [src]
+      Path -> Partition:
+        pfile:/opensourcetest/23mayTestEnv/build/ql/test/data/warehouse/srcpart/ds=2008-04-08/hr=11
+          Partition
+            base file name: hr=11
+            input format: org.apache.hadoop.mapred.TextInputFormat
+            output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+            partition values:
+              ds 2008-04-08
+              hr 11
+            properties:
+              bucket_count -1
+              columns key,value
+              columns.types string:string
+              file.inputformat org.apache.hadoop.mapred.TextInputFormat
+              file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+              location pfile:/opensourcetest/23mayTestEnv/build/ql/test/data/warehouse/srcpart/ds=2008-04-08/hr=11
+              name default.srcpart
+              partition_columns ds/hr
+              serialization.ddl struct srcpart { string key, string value}
+              serialization.format 1
+              serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              transient_lastDdlTime 1306489888
+            serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+              input format: org.apache.hadoop.mapred.TextInputFormat
+              output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+              properties:
+                bucket_count -1
+                columns key,value
+                columns.types string:string
+                file.inputformat org.apache.hadoop.mapred.TextInputFormat
+                file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                location pfile:/opensourcetest/23mayTestEnv/build/ql/test/data/warehouse/srcpart
+                name default.srcpart
+                partition_columns ds/hr
+                serialization.ddl struct srcpart { string key, string value}
+                serialization.format 1
+                serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                transient_lastDdlTime 1306489882
+              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              name: default.srcpart
+            name: default.srcpart
+        pfile:/opensourcetest/23mayTestEnv/build/ql/test/data/warehouse/srcpart/ds=2008-04-08/hr=12
+          Partition
+            base file name: hr=12
+            input format: org.apache.hadoop.mapred.TextInputFormat
+            output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+            partition values:
+              ds 2008-04-08
+              hr 12
+            properties:
+              bucket_count -1
+              columns key,value
+              columns.types string:string
+              file.inputformat org.apache.hadoop.mapred.TextInputFormat
+              file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+              location pfile:/opensourcetest/23mayTestEnv/build/ql/test/data/warehouse/srcpart/ds=2008-04-08/hr=12
+              name default.srcpart
+              partition_columns ds/hr
+              serialization.ddl struct srcpart { string key, string value}
+              serialization.format 1
+              serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              transient_lastDdlTime 1306489890
+            serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+              input format: org.apache.hadoop.mapred.TextInputFormat
+              output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+              properties:
+                bucket_count -1
+                columns key,value
+                columns.types string:string
+                file.inputformat org.apache.hadoop.mapred.TextInputFormat
+                file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                location pfile:/opensourcetest/23mayTestEnv/build/ql/test/data/warehouse/srcpart
+                name default.srcpart
+                partition_columns ds/hr
+                serialization.ddl struct srcpart { string key, string value}
+                serialization.format 1
+                serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                transient_lastDdlTime 1306489882
+              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              name: default.srcpart
+            name: default.srcpart
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations:
+                expr: count(DISTINCT KEY._col1:0._col0)
+                expr: sum(VALUE._col1)
+          bucketGroup: false
+          keys:
+                expr: KEY._col0
+                type: string
+          mode: mergepartial
+          outputColumnNames: _col0, _col1, _col2
+          Select Operator
+            expressions:
+                  expr: _col0
+                  type: string
+                  expr: _col1
+                  type: bigint
+                  expr: concat(_col0, _col2)
+                  type: string
+            outputColumnNames: _col0, _col1, _col2
+            Select Operator
+              expressions:
+                    expr: _col0
+                    type: string
+                    expr: UDFToInteger(_col1)
+                    type: int
+                    expr: _col2
+                    type: string
+              outputColumnNames: _col0, _col1, _col2
+              File Output Operator
+                compressed: false
+                GlobalTableId: 1
+                directory: pfile:/opensourcetest/23mayTestEnv/build/ql/scratchdir/hive_2011-05-27_02-51-44_277_7930168814170762682/-ext-10000
+                NumFilesPerFileSink: 1
+                Stats Publishing Key Prefix: pfile:/opensourcetest/23mayTestEnv/build/ql/scratchdir/hive_2011-05-27_02-51-44_277_7930168814170762682/-ext-10000/
+                table:
+                    input format: org.apache.hadoop.mapred.TextInputFormat
+                    output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                    properties:
+                      bucket_count -1
+                      columns key,c1,c2
+                      columns.types string:int:string
+                      file.inputformat org.apache.hadoop.mapred.TextInputFormat
+                      file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                      location pfile:/opensourcetest/23mayTestEnv/build/ql/test/data/warehouse/dest1
+                      name default.dest1
+                      serialization.ddl struct dest1 { string key, i32 c1, string c2}
+                      serialization.format 1
+                      serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                      transient_lastDdlTime 1306489903
+                    serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                    name: default.dest1
+                TotalFiles: 1
+                GatherStats: true
+                MultiFileSpray: false
+
+  Stage: Stage-0
+    Move Operator
+      tables:
+          replace: true
+          source: pfile:/opensourcetest/23mayTestEnv/build/ql/scratchdir/hive_2011-05-27_02-51-44_277_7930168814170762682/-ext-10000
+          table:
+              input format: org.apache.hadoop.mapred.TextInputFormat
+              output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+              properties:
+                bucket_count -1
+                columns key,c1,c2
+                columns.types string:int:string
+                file.inputformat org.apache.hadoop.mapred.TextInputFormat
+                file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                location pfile:/opensourcetest/23mayTestEnv/build/ql/test/data/warehouse/dest1
+                name default.dest1
+                serialization.ddl struct dest1 { string key, i32 c1, string c2}
+                serialization.format 1
+                serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                transient_lastDdlTime 1306489903
+              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              name: default.dest1
+          tmp directory: pfile:/opensourcetest/23mayTestEnv/build/ql/scratchdir/hive_2011-05-27_02-51-44_277_7930168814170762682/-ext-10001
+
+  Stage: Stage-2
+    Stats-Aggr Operator
+      Stats Aggregation Key Prefix: pfile:/opensourcetest/23mayTestEnv/build/ql/scratchdir/hive_2011-05-27_02-51-44_277_7930168814170762682/-ext-10000/
+
+
+PREHOOK: query: FROM srcpart src
+INSERT OVERWRITE TABLE dest1
+SELECT substr(src.key,1,1), count(DISTINCT substr(src.value,5)), concat(substr(src.key,1,1),sum(substr(src.value,5)))
+WHERE src.ds = '2008-04-08'
+GROUP BY substr(src.key,1,1)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@srcpart@ds=2008-04-08/hr=11
+PREHOOK: Input: default@srcpart@ds=2008-04-08/hr=12
+PREHOOK: Output: default@dest1
+POSTHOOK: query: FROM srcpart src
+INSERT OVERWRITE TABLE dest1
+SELECT substr(src.key,1,1), count(DISTINCT substr(src.value,5)), concat(substr(src.key,1,1),sum(substr(src.value,5)))
+WHERE src.ds = '2008-04-08'
+GROUP BY substr(src.key,1,1)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@srcpart@ds=2008-04-08/hr=11
+POSTHOOK: Input: default@srcpart@ds=2008-04-08/hr=12
+POSTHOOK: Output: default@dest1
+POSTHOOK: Lineage: dest1.c1 EXPRESSION [(srcpart)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: dest1.c2 EXPRESSION [(srcpart)src.FieldSchema(name:key, type:string, comment:default), (srcpart)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: dest1.key EXPRESSION [(srcpart)src.FieldSchema(name:key, type:string, comment:default), ]
+FAILED: Error in semantic analysis: Expression not in GROUP BY key c1
Index: ql/src/test/results/clientpositive/groupby_selectall.q.out
===================================================================
--- ql/src/test/results/clientpositive/groupby_selectall.q.out	(revision 0)
+++ ql/src/test/results/clientpositive/groupby_selectall.q.out	(revision 0)
@@ -0,0 +1,289 @@
+PREHOOK: query: CREATE TABLE dest1(key STRING, c1 INT, c2 STRING) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE dest1(key STRING, c1 INT, c2 STRING) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@dest1
+PREHOOK: query: EXPLAIN EXTENDED
+FROM srcpart src
+INSERT OVERWRITE TABLE dest1
+SELECT substr(src.key,1,1), count(DISTINCT substr(src.value,5)), concat(substr(src.key,1,1),sum(substr(src.value,5)))
+WHERE src.ds = '2008-04-08'
+GROUP BY substr(src.key,1,1)
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN EXTENDED
+FROM srcpart src
+INSERT OVERWRITE TABLE dest1
+SELECT substr(src.key,1,1), count(DISTINCT substr(src.value,5)), concat(substr(src.key,1,1),sum(substr(src.value,5)))
+WHERE src.ds = '2008-04-08'
+GROUP BY substr(src.key,1,1)
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+  (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME srcpart) src)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME dest1))) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) key) 1 1)) (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION concat (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) key) 1 1) (TOK_FUNCTION sum (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))))) (TOK_WHERE (= (. (TOK_TABLE_OR_COL src) ds) '2008-04-08')) (TOK_GROUPBY (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) key) 1 1))))
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+  Stage-2 depends on stages: Stage-0
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Alias -> Map Operator Tree:
+        src
+          TableScan
+            alias: src
+            GatherStats: false
+            Filter Operator
+              isSamplingPred: false
+              predicate:
+                  expr: (ds = '2008-04-08')
+                  type: boolean
+              Select Operator
+                expressions:
+                      expr: key
+                      type: string
+                      expr: value
+                      type: string
+                outputColumnNames: key, value
+                Group By Operator
+                  aggregations:
+                        expr: count(DISTINCT substr(value, 5))
+                        expr: sum(substr(value, 5))
+                  bucketGroup: false
+                  keys:
+                        expr: substr(key, 1, 1)
+                        type: string
+                        expr: substr(value, 5)
+                        type: string
+                  mode: hash
+                  outputColumnNames: _col0, _col1, _col2, _col3
+                  Reduce Output Operator
+                    key expressions:
+                          expr: _col0
+                          type: string
+                          expr: _col1
+                          type: string
+                    sort order: ++
+                    Map-reduce partition columns:
+                          expr: _col0
+                          type: string
+                    tag: -1
+                    value expressions:
+                          expr: _col2
+                          type: bigint
+                          expr: _col3
+                          type: double
+      Needs Tagging: false
+      Path -> Alias:
+        pfile:/opensourcetest/23mayTestEnv/build/ql/test/data/warehouse/srcpart/ds=2008-04-08/hr=11 [src]
+        pfile:/opensourcetest/23mayTestEnv/build/ql/test/data/warehouse/srcpart/ds=2008-04-08/hr=12 [src]
+      Path -> Partition:
+        pfile:/opensourcetest/23mayTestEnv/build/ql/test/data/warehouse/srcpart/ds=2008-04-08/hr=11
+          Partition
+            base file name: hr=11
+            input format: org.apache.hadoop.mapred.TextInputFormat
+            output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+            partition values:
+              ds 2008-04-08
+              hr 11
+            properties:
+              bucket_count -1
+              columns key,value
+              columns.types string:string
+              file.inputformat org.apache.hadoop.mapred.TextInputFormat
+              file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+              location pfile:/opensourcetest/23mayTestEnv/build/ql/test/data/warehouse/srcpart/ds=2008-04-08/hr=11
+              name default.srcpart
+              partition_columns ds/hr
+              serialization.ddl struct srcpart { string key, string value}
+              serialization.format 1
+              serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              transient_lastDdlTime 1306487297
+            serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+              input format: org.apache.hadoop.mapred.TextInputFormat
+              output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+              properties:
+                bucket_count -1
+                columns key,value
+                columns.types string:string
+                file.inputformat org.apache.hadoop.mapred.TextInputFormat
+                file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                location pfile:/opensourcetest/23mayTestEnv/build/ql/test/data/warehouse/srcpart
+                name default.srcpart
+                partition_columns ds/hr
+                serialization.ddl struct srcpart { string key, string value}
+                serialization.format 1
+                serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                transient_lastDdlTime 1306487292
+              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              name: default.srcpart
+            name: default.srcpart
+        pfile:/opensourcetest/23mayTestEnv/build/ql/test/data/warehouse/srcpart/ds=2008-04-08/hr=12
+          Partition
+            base file name: hr=12
+            input format: org.apache.hadoop.mapred.TextInputFormat
+            output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+            partition values:
+              ds 2008-04-08
+              hr 12
+            properties:
+              bucket_count -1
+              columns key,value
+              columns.types string:string
+              file.inputformat org.apache.hadoop.mapred.TextInputFormat
+              file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+              location pfile:/opensourcetest/23mayTestEnv/build/ql/test/data/warehouse/srcpart/ds=2008-04-08/hr=12
+              name default.srcpart
+              partition_columns ds/hr
+              serialization.ddl struct srcpart { string key, string value}
+              serialization.format 1
+              serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              transient_lastDdlTime 1306487299
+            serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+              input format: org.apache.hadoop.mapred.TextInputFormat
+              output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+              properties:
+                bucket_count -1
+                columns key,value
+                columns.types string:string
+                file.inputformat org.apache.hadoop.mapred.TextInputFormat
+                file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                location pfile:/opensourcetest/23mayTestEnv/build/ql/test/data/warehouse/srcpart
+                name default.srcpart
+                partition_columns ds/hr
+                serialization.ddl struct srcpart { string key, string value}
+                serialization.format 1
+                serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                transient_lastDdlTime 1306487292
+              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              name: default.srcpart
+            name: default.srcpart
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations:
+                expr: count(DISTINCT KEY._col1:0._col0)
+                expr: sum(VALUE._col1)
+          bucketGroup: false
+          keys:
+                expr: KEY._col0
+                type: string
+          mode: mergepartial
+          outputColumnNames: _col0, _col1, _col2
+          Select Operator
+            expressions:
+                  expr: _col0
+                  type: string
+                  expr: _col1
+                  type: bigint
+                  expr: concat(_col0, _col2)
+                  type: string
+            outputColumnNames: _col0, _col1, _col2
+            Select Operator
+              expressions:
+                    expr: _col0
+                    type: string
+                    expr: UDFToInteger(_col1)
+                    type: int
+                    expr: _col2
+                    type: string
+              outputColumnNames: _col0, _col1, _col2
+              File Output Operator
+                compressed: false
+                GlobalTableId: 1
+                directory: pfile:/opensourcetest/23mayTestEnv/build/ql/scratchdir/hive_2011-05-27_02-08-34_354_1414867554455264232/-ext-10000
+                NumFilesPerFileSink: 1
+                Stats Publishing Key Prefix: pfile:/opensourcetest/23mayTestEnv/build/ql/scratchdir/hive_2011-05-27_02-08-34_354_1414867554455264232/-ext-10000/
+                table:
+                    input format: org.apache.hadoop.mapred.TextInputFormat
+                    output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                    properties:
+                      bucket_count -1
+                      columns key,c1,c2
+                      columns.types string:int:string
+                      file.inputformat org.apache.hadoop.mapred.TextInputFormat
+                      file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                      location pfile:/opensourcetest/23mayTestEnv/build/ql/test/data/warehouse/dest1
+                      name default.dest1
+                      serialization.ddl struct dest1 { string key, i32 c1, string c2}
+                      serialization.format 1
+                      serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                      transient_lastDdlTime 1306487314
+                    serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                    name: default.dest1
+                TotalFiles: 1
+                GatherStats: true
+                MultiFileSpray: false
+
+  Stage: Stage-0
+    Move Operator
+      tables:
+          replace: true
+          source: pfile:/opensourcetest/23mayTestEnv/build/ql/scratchdir/hive_2011-05-27_02-08-34_354_1414867554455264232/-ext-10000
+          table:
+              input format: org.apache.hadoop.mapred.TextInputFormat
+              output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+              properties:
+                bucket_count -1
+                columns key,c1,c2
+                columns.types string:int:string
+                file.inputformat org.apache.hadoop.mapred.TextInputFormat
+                file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                location pfile:/opensourcetest/23mayTestEnv/build/ql/test/data/warehouse/dest1
+                name default.dest1
+                serialization.ddl struct dest1 { string key, i32 c1, string c2}
+                serialization.format 1
+                serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                transient_lastDdlTime 1306487314
+              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              name: default.dest1
+          tmp directory: pfile:/opensourcetest/23mayTestEnv/build/ql/scratchdir/hive_2011-05-27_02-08-34_354_1414867554455264232/-ext-10001
+
+  Stage: Stage-2
+    Stats-Aggr Operator
+      Stats Aggregation Key Prefix: pfile:/opensourcetest/23mayTestEnv/build/ql/scratchdir/hive_2011-05-27_02-08-34_354_1414867554455264232/-ext-10000/
+
+
+PREHOOK: query: FROM srcpart src
+INSERT OVERWRITE TABLE dest1
+SELECT substr(src.key,1,1), count(DISTINCT substr(src.value,5)), concat(substr(src.key,1,1),sum(substr(src.value,5)))
+WHERE src.ds = '2008-04-08'
+GROUP BY substr(src.key,1,1)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@srcpart@ds=2008-04-08/hr=11
+PREHOOK: Input: default@srcpart@ds=2008-04-08/hr=12
+PREHOOK: Output: default@dest1
+POSTHOOK: query: FROM srcpart src
+INSERT OVERWRITE TABLE dest1
+SELECT substr(src.key,1,1), count(DISTINCT substr(src.value,5)), concat(substr(src.key,1,1),sum(substr(src.value,5)))
+WHERE src.ds = '2008-04-08'
+GROUP BY substr(src.key,1,1)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@srcpart@ds=2008-04-08/hr=11
+POSTHOOK: Input: default@srcpart@ds=2008-04-08/hr=12
+POSTHOOK: Output: default@dest1
+POSTHOOK: Lineage: dest1.c1 EXPRESSION [(srcpart)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: dest1.c2 EXPRESSION [(srcpart)src.FieldSchema(name:key, type:string, comment:default), (srcpart)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: dest1.key EXPRESSION [(srcpart)src.FieldSchema(name:key, type:string, comment:default), ]
+PREHOOK: query: SELECT * FROM dest1 GROUP BY key,c1,c2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@dest1
+PREHOOK: Output: file:/tmp/root/hive_2011-05-27_02-08-58_369_4946122424247258194/-mr-10000
+POSTHOOK: query: SELECT * FROM dest1 GROUP BY key,c1,c2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@dest1
+POSTHOOK: Output: file:/tmp/root/hive_2011-05-27_02-08-58_369_4946122424247258194/-mr-10000
+POSTHOOK: Lineage: dest1.c1 EXPRESSION [(srcpart)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: dest1.c2 EXPRESSION [(srcpart)src.FieldSchema(name:key, type:string, comment:default), (srcpart)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: dest1.key EXPRESSION [(srcpart)src.FieldSchema(name:key, type:string, comment:default), ]
+0	1	00.0
+1	71	132828.0
+2	69	251142.0
+3	62	364008.0
+4	74	4105526.0
+5	6	5794.0
+6	5	6796.0
+7	6	71470.0
+8	8	81524.0
+9	7	92094.0
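
Note: the rule this patch enforces, illustrated with a hypothetical table t(a INT, b INT); this sketch is for reference only and is not part of the patch or its test suite.

-- Rejected after this patch: 'b' is expanded from '*' but is not a group-by
-- key, so validateGroupBy() raises NON_KEY_EXPR_IN_GROUPBY.
SELECT * FROM t GROUP BY a;

-- Accepted: every column expanded from '*' appears in the GROUP BY clause.
SELECT * FROM t GROUP BY a, b;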