diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/HiveParser.g ql/src/java/org/apache/hadoop/hive/ql/parse/HiveParser.g index 216c361..69865c8 100644 --- ql/src/java/org/apache/hadoop/hive/ql/parse/HiveParser.g +++ ql/src/java/org/apache/hadoop/hive/ql/parse/HiveParser.g @@ -314,6 +314,7 @@ TOK_SUBQUERY_OP_NOTIN; TOK_SUBQUERY_OP_NOTEXISTS; TOK_DB_TYPE; TOK_TABLE_TYPE; +TOK_CTE; } @@ -1888,10 +1889,36 @@ setOperator queryStatementExpression[boolean topLevel] : + /* Would be nice to do this as a gated semantic predicate + But the predicate gets pushed as a lookahead decision. + Calling rule does not know about topLevel + */ + (w=withClause {topLevel}?)? + queryStatementExpressionBody[topLevel] { + if ($w.tree != null) { + adaptor.addChild($queryStatementExpressionBody.tree, $w.tree); + } + } + -> queryStatementExpressionBody + ; + +queryStatementExpressionBody[boolean topLevel] + : fromStatement[topLevel] | regularBody[topLevel] ; +withClause + : + KW_WITH cteStatement+ -> ^(TOK_CTE cteStatement+) +; + +cteStatement + : + identifier KW_AS LPAREN queryStatementExpression[false] RPAREN + -> ^(TOK_SUBQUERY queryStatementExpression identifier) +; + fromStatement[boolean topLevel] : (singleFromStatement -> singleFromStatement) (u=setOperator r=singleFromStatement diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/QB.java ql/src/java/org/apache/hadoop/hive/ql/parse/QB.java index 3fbe8e2..13252f7 100644 --- ql/src/java/org/apache/hadoop/hive/ql/parse/QB.java +++ ql/src/java/org/apache/hadoop/hive/ql/parse/QB.java @@ -218,6 +218,10 @@ public void rewriteViewToSubq(String alias, String viewName, QBExpr qbexpr) { assert (viewName.equals(tableName)); aliasToSubq.put(alias, qbexpr); } + + public void rewriteCTEToSubq(String alias, String cteName, QBExpr qbexpr) { + rewriteViewToSubq(alias, cteName, qbexpr); + } public QBJoinTree getQbJoinTree() { return qbjoin; diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java 
ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java index 1f7aae0..c70f0a3 100644 --- ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java +++ ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java @@ -240,6 +240,15 @@ //flag for partial scan during analyze ... compute statistics protected boolean partialscan = false; + + /* + * Capture the CTE definitions in a Query. + */ + private final Map aliasToCTEs; + /* + * Used to check recursive CTE invocations. Similar to viewsExpanded + */ + private ArrayList ctesExpanded; private static class Phase1Ctx { String dest; @@ -276,6 +285,7 @@ public SemanticAnalyzer(HiveConf conf) throws SemanticException { HiveConf.ConfVars.HIVE_AUTOGEN_COLUMNALIAS_PREFIX_INCLUDEFUNCNAME); queryProperties = new QueryProperties(); opToPartToSkewedPruner = new HashMap>(); + aliasToCTEs = new HashMap(); } @Override @@ -295,6 +305,7 @@ protected void reset() { opParseCtx.clear(); groupOpToInputTables.clear(); prunedPartitions.clear(); + aliasToCTEs.clear(); } public void initParseCtx(ParseContext pctx) { @@ -656,6 +667,53 @@ private String processSubQuery(QB qb, ASTNode subq) throws SemanticException { return alias; } + + /* + * Phase1: hold onto any CTE definitions in aliasToCTEs. + * CTE definitions are global to the Query. + */ + private void processCTE(QB qb, ASTNode ctes) throws SemanticException { + + int numCTEs = ctes.getChildCount(); + + for(int i=0; i > aliasToViewInfo = new HashMap>(); + + /* + * used to capture CTE to SQ conversions. This is used to check for + * recursive CTE invocations. + */ + Map sqAliasToCTEName = new HashMap(); + for (String alias : tabAliases) { String tab_name = qb.getTabNameForAlias(alias); Table tab = null; try { tab = db.getTable(tab_name); } catch (InvalidTableException ite) { + /* + * if this is a CTE reference: + * Add its AST as a SubQuery to this QB. 
+ */ + ASTNode cteNode = aliasToCTEs.get(tab_name.toLowerCase()); + if ( cteNode != null ) { + String cte_name = tab_name.toLowerCase(); + if (ctesExpanded.contains(cte_name)) { + throw new SemanticException("Recursive cte " + tab_name + + " detected (cycle: " + StringUtils.join(ctesExpanded, " -> ") + + " -> " + tab_name + ")."); + } + addCTEAsSubQuery(qb, cte_name, alias); + sqAliasToCTEName.put(alias, cte_name); + continue; + } throw new SemanticException(ErrorMsg.INVALID_TABLE.getMsg(qb .getParseInfo().getSrcForAlias(alias))); } @@ -1182,15 +1266,20 @@ public void getMetaData(QB qb, ReadEntity parentInput) throws SemanticException // Go over the subqueries and getMetaData for these for (String alias : qb.getSubqAliases()) { boolean wasView = aliasToViewInfo.containsKey(alias); + boolean wasCTE = sqAliasToCTEName.containsKey(alias); ReadEntity newParentInput = null; if (wasView) { viewsExpanded.add(aliasToViewInfo.get(alias).getFirst()); newParentInput = aliasToViewInfo.get(alias).getSecond(); + } else if (wasCTE ) { + ctesExpanded.add(sqAliasToCTEName.get(alias)); } QBExpr qbexpr = qb.getSubqForAlias(alias); getMetaData(qbexpr, newParentInput); if (wasView) { viewsExpanded.remove(viewsExpanded.size() - 1); + } else if ( wasCTE ) { + ctesExpanded.remove(ctesExpanded.size() - 1); } } @@ -8857,6 +8946,7 @@ public void analyzeInternal(ASTNode ast) throws SemanticException { ASTNode child = ast; this.ast = ast; viewsExpanded = new ArrayList(); + ctesExpanded = new ArrayList(); LOG.info("Starting Semantic Analysis"); diff --git ql/src/test/queries/clientnegative/cte_recursion.q ql/src/test/queries/clientnegative/cte_recursion.q new file mode 100644 index 0000000..f9c1405 --- /dev/null +++ ql/src/test/queries/clientnegative/cte_recursion.q @@ -0,0 +1,4 @@ +explain +with q1 as ( select key from q2 where key = '5') +q2 as ( select key from q1 where key = '5') +select * from (select key from q1) a; \ No newline at end of file diff --git 
ql/src/test/queries/clientnegative/cte_with_in_subquery.q ql/src/test/queries/clientnegative/cte_with_in_subquery.q new file mode 100644 index 0000000..e52a1d9 --- /dev/null +++ ql/src/test/queries/clientnegative/cte_with_in_subquery.q @@ -0,0 +1 @@ +select * from (with q1 as ( select key from q2 where key = '5') select * from q1) a; diff --git ql/src/test/queries/clientpositive/cte_1.q ql/src/test/queries/clientpositive/cte_1.q new file mode 100644 index 0000000..cfd8238 --- /dev/null +++ ql/src/test/queries/clientpositive/cte_1.q @@ -0,0 +1,28 @@ +explain +with q1 as ( select key from src where key = '5') +select * +from q1 +; + +with q1 as ( select key from src where key = '5') +select * +from q1 +; + +-- in subquery +explain +with q1 as ( select key from src where key = '5') +select * from (select key from q1) a; + +with q1 as ( select key from src where key = '5') +select * from (select key from q1) a; + +-- chaining +explain +with q1 as ( select key from q2 where key = '5') +q2 as ( select key from src where key = '5') +select * from (select key from q1) a; + +with q1 as ( select key from q2 where key = '5') +q2 as ( select key from src where key = '5') +select * from (select key from q1) a; \ No newline at end of file diff --git ql/src/test/results/clientnegative/cte_recursion.q.out ql/src/test/results/clientnegative/cte_recursion.q.out new file mode 100644 index 0000000..10aed92 --- /dev/null +++ ql/src/test/results/clientnegative/cte_recursion.q.out @@ -0,0 +1 @@ +FAILED: SemanticException Recursive cte q1 detected (cycle: q1 -> q2 -> q1). diff --git ql/src/test/results/clientnegative/cte_with_in_subquery.q.out ql/src/test/results/clientnegative/cte_with_in_subquery.q.out new file mode 100644 index 0000000..fa22b48 --- /dev/null +++ ql/src/test/results/clientnegative/cte_with_in_subquery.q.out @@ -0,0 +1 @@ +FAILED: ParseException line 1:64 Failed to recognize predicate 'select'. 
Failed rule: 'queryStatementExpression' in subquery source diff --git ql/src/test/results/clientpositive/cte_1.q.out ql/src/test/results/clientpositive/cte_1.q.out new file mode 100644 index 0000000..6d069bf --- /dev/null +++ ql/src/test/results/clientpositive/cte_1.q.out @@ -0,0 +1,180 @@ +PREHOOK: query: explain +with q1 as ( select key from src where key = '5') +select * +from q1 +PREHOOK: type: QUERY +POSTHOOK: query: explain +with q1 as ( select key from src where key = '5') +select * +from q1 +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME q1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF))) (TOK_CTE (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME src))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key))) (TOK_WHERE (= (TOK_TABLE_OR_COL key) '5')))) q1))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + q1:src + TableScan + alias: src + Filter Operator + predicate: + expr: (key = '5') + type: boolean + Select Operator + expressions: + expr: key + type: string + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + +PREHOOK: query: with q1 as ( select key from src where key = '5') +select * +from q1 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: with q1 as ( select key from src where key = '5') +select * +from q1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +5 +5 +5 +PREHOOK: query: -- in subquery +explain +with q1 as ( 
select key from src where key = '5') +select * from (select key from q1) a +PREHOOK: type: QUERY +POSTHOOK: query: -- in subquery +explain +with q1 as ( select key from src where key = '5') +select * from (select key from q1) a +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME q1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key))))) a)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF))) (TOK_CTE (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME src))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key))) (TOK_WHERE (= (TOK_TABLE_OR_COL key) '5')))) q1))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + a:q1:src + TableScan + alias: src + Filter Operator + predicate: + expr: (key = '5') + type: boolean + Select Operator + expressions: + expr: key + type: string + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + +PREHOOK: query: with q1 as ( select key from src where key = '5') +select * from (select key from q1) a +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: with q1 as ( select key from src where key = '5') +select * from (select key from q1) a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +5 +5 +5 +PREHOOK: query: -- chaining +explain +with q1 as ( select key from q2 where key = '5') +q2 as ( select key from src where key = '5') +select * from 
(select key from q1) a +PREHOOK: type: QUERY +POSTHOOK: query: -- chaining +explain +with q1 as ( select key from q2 where key = '5') +q2 as ( select key from src where key = '5') +select * from (select key from q1) a +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME q1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key))))) a)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF))) (TOK_CTE (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME q2))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key))) (TOK_WHERE (= (TOK_TABLE_OR_COL key) '5')))) q1) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME src))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key))) (TOK_WHERE (= (TOK_TABLE_OR_COL key) '5')))) q2))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + a:q1:q2:src + TableScan + alias: src + Filter Operator + predicate: + expr: (key = '5') + type: boolean + Select Operator + expressions: + expr: key + type: string + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + +PREHOOK: query: with q1 as ( select key from q2 where key = '5') +q2 as ( select key from src where key = '5') +select * from (select key from q1) a +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: with q1 as ( select key from q2 where key = '5') +q2 as ( select key from src 
where key = '5') +select * from (select key from q1) a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +5 +5 +5