Index: build.properties =================================================================== --- build.properties (revision 1520834) +++ build.properties (working copy) @@ -16,7 +16,7 @@ Name=Hive name=hive -version=0.13.0-SNAPSHOT +version=0.12.0-SNAPSHOT hcatalog.version=${version} year=2012 Index: ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java (revision 1520834) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java (working copy) @@ -133,9 +133,9 @@ import org.apache.hadoop.hive.ql.udf.generic.*; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFLeadLag.GenericUDFLag; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFLeadLag.GenericUDFLead; -import org.apache.hadoop.hive.ql.udf.ptf.NPath.NPathResolver; import org.apache.hadoop.hive.ql.udf.ptf.Noop.NoopResolver; import org.apache.hadoop.hive.ql.udf.ptf.NoopWithMap.NoopWithMapResolver; +import org.apache.hadoop.hive.ql.udf.ptf.RegexPath.RegexPathResolver; import org.apache.hadoop.hive.ql.udf.ptf.TableFunctionResolver; import org.apache.hadoop.hive.ql.udf.ptf.WindowingTableFunction.WindowingTableFunctionResolver; import org.apache.hadoop.hive.ql.udf.xml.GenericUDFXPath; @@ -454,7 +454,8 @@ registerTableFunction(NOOP_TABLE_FUNCTION, NoopResolver.class); registerTableFunction(NOOP_MAP_TABLE_FUNCTION, NoopWithMapResolver.class); registerTableFunction(WINDOWING_TABLE_FUNCTION, WindowingTableFunctionResolver.class); - registerTableFunction("npath", NPathResolver.class); + registerTableFunction("regex_path", RegexPathResolver.class); + } public static void registerTemporaryUDF(String functionName, Index: ql/src/java/org/apache/hadoop/hive/ql/parse/ParseDriver.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/parse/ParseDriver.java (revision 1520834) +++ 
ql/src/java/org/apache/hadoop/hive/ql/parse/ParseDriver.java (working copy) @@ -208,9 +208,9 @@ /* * parse a String as a Select List. This allows table functions to be passed expression Strings * that are translated in - * the context they define at invocation time. Currently used by NPath to allow users to specify + * the context they define at invocation time. Currently used by RegexPath to allow users to specify * what output they want. - * NPath allows expressions n 'tpath' a column that represents the matched set of rows. This + * RegexPath allows expressions n 'tpath' a column that represents the matched set of rows. This * column doesn't exist in * the input schema and hence the Result Expression cannot be analyzed by the regular Hive * translation process. Index: ql/src/java/org/apache/hadoop/hive/ql/udf/ptf/RegexPath.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/udf/ptf/RegexPath.java (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/udf/ptf/RegexPath.java (working copy) @@ -0,0 +1,918 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.ql.udf.ptf; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.hadoop.hive.ql.exec.ColumnInfo; +import org.apache.hadoop.hive.ql.exec.ExprNodeEvaluator; +import org.apache.hadoop.hive.ql.exec.ExprNodeEvaluatorFactory; +import org.apache.hadoop.hive.ql.exec.PTFPartition; +import org.apache.hadoop.hive.ql.exec.PTFPartition.PTFPartitionIterator; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.parse.ASTNode; +import org.apache.hadoop.hive.ql.parse.PTFTranslator; +import org.apache.hadoop.hive.ql.parse.RowResolver; +import org.apache.hadoop.hive.ql.parse.SemanticAnalyzer; +import org.apache.hadoop.hive.ql.parse.SemanticException; +import org.apache.hadoop.hive.ql.parse.TypeCheckCtx; +import org.apache.hadoop.hive.ql.parse.TypeCheckProcFactory; +import org.apache.hadoop.hive.ql.parse.WindowingSpec.WindowExpressionSpec; +import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; +import org.apache.hadoop.hive.ql.plan.PTFDesc; +import org.apache.hadoop.hive.ql.plan.PTFDesc.PTFExpressionDef; +import org.apache.hadoop.hive.ql.plan.PTFDesc.PTFInputDef; +import org.apache.hadoop.hive.ql.plan.PTFDesc.PartitionedTableFunctionDef; +import org.apache.hadoop.hive.serde2.objectinspector.ConstantObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters.Converter; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StandardListObjectInspector; +import 
org.apache.hadoop.hive.serde2.objectinspector.StructField; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; + +/** + * return rows that meet a specified pattern. Use symbols to specify a list of expressions + * to match. + * Pattern is used to specify a Path. The results list can contain expressions based on + * the input columns and also the matched Path. + *
    + *
  1. pattern: pattern for the Path. Path is a 'dot' separated list of symbols. + * Each element is treated as a symbol. Elements that end in '*' or '+' are interpreted with + * the usual meaning of zero or more, one or more respectively. E.g. + * "LATE.EARLY*.ONTIMEOREARLY" implies a sequence of flights + * where the first occurrence was LATE, followed by zero or more EARLY flights, + * followed by an ONTIME or EARLY flight. + *
  2. symbols specify a list of name, expression pairs. E.g. + * 'LATE', arrival_delay > 0, 'EARLY', arrival_delay < 0, 'ONTIME', arrival_delay == 0. + * These symbols can be used in the Pattern defined above. + *
  3. resultSelectList is specified as a select list. + * The expressions in the selectList are evaluated in the context where all the + * input columns are available, plus the attribute + * "tpath". tpath is a collection of rows that represents the matching Path. + *
+ */ +public class RegexPath extends TableFunctionEvaluator { + private transient String patternStr; + private transient SymbolsInfo symInfo; + private transient String resultExprStr; + private transient SymbolFunction syFn; + private ResultExprInfo resultExprInfo; + /* + * the names of the Columns of the input to RegexPath. Used to setup the tpath Struct column. + */ + private HashMap inputColumnNamesMap; + + @Override + public void execute(PTFPartitionIterator pItr, PTFPartition outP) throws HiveException + { + while (pItr.hasNext()) + { + Object iRow = pItr.next(); + + SymbolFunctionResult syFnRes = SymbolFunction.match(syFn, iRow, pItr); + if (syFnRes.matches ) + { + int sz = syFnRes.nextRow - (pItr.getIndex() - 1); + Object selectListInput = RegexPath.getSelectListInput(iRow, + tDef.getInput().getOutputShape().getOI(), pItr, sz); + ArrayList oRow = new ArrayList(); + for(ExprNodeEvaluator resExprEval : resultExprInfo.resultExprEvals) + { + oRow.add(resExprEval.evaluate(selectListInput)); + } + outP.append(oRow); + } + } + } + + static void throwErrorWithSignature(String message) throws SemanticException + { + throw new SemanticException(String.format( + "RegexPath signature is: SymbolPattern, one or more SymbolName, " + + "expression pairs, the result expression as a select list. Error %s", + message)); + } + + public HashMap getInputColumnNames() { + return inputColumnNamesMap; + } + + public void setInputColumnNames(HashMap inputColumnNamesMap) { + this.inputColumnNamesMap = inputColumnNamesMap; + } + + public static class RegexPathResolver extends TableFunctionResolver + { + + @Override + protected TableFunctionEvaluator createEvaluator(PTFDesc ptfDesc, + PartitionedTableFunctionDef tDef) + { + + return new RegexPath(); + } + + /** + *
    + *
  • check structure of Arguments: + *
      + *
    1. First arg should be a constant String (the symbol pattern) + *
    2. then there should be an even number of Arguments: + * constant String, expression; expression must be a boolean expression. + *
    3. finally there should be a constant String (the result select list). + *
    + *
  • convert pattern into a SymbolFunction chain. + *
  • convert symbol args into a Symbol Map. + *
  • parse selectList into a ResultExprInfo. The inputOI used to translate + * these expressions should be based on the + * columns in the Input, plus the 'tpath' column + *
+ */ + @Override + public void setupOutputOI() throws SemanticException + { + RegexPath evaluator = (RegexPath) getEvaluator(); + PartitionedTableFunctionDef tDef = evaluator.getTableDef(); + + ArrayList args = tDef.getArgs(); + int argsNum = args == null ? 0 : args.size(); + + if ( argsNum < 4 ) + { + throwErrorWithSignature("at least 4 arguments required"); + } + + validateAndSetupPatternStr(evaluator, args); + validateAndSetupSymbolInfo(evaluator, args, argsNum); + validateAndSetupResultExprStr(evaluator, args, argsNum); + setupSymbolFunctionChain(evaluator); + + /* + * setup OI for input to resultExpr select list + */ + RowResolver selectListInputRR = RegexPath.createSelectListRR(evaluator, tDef.getInput()); + + /* + * parse ResultExpr Str and setup OI. + */ + ResultExpressionParser resultExprParser = + new ResultExpressionParser(evaluator.resultExprStr, selectListInputRR); + try { + resultExprParser.translate(); + } + catch(HiveException he) { + throw new SemanticException(he); + } + evaluator.resultExprInfo = resultExprParser.getResultExprInfo(); + StructObjectInspector OI = evaluator.resultExprInfo.resultOI; + + setOutputOI(OI); + } + /* + * validate and setup patternStr + */ + private void validateAndSetupPatternStr(RegexPath evaluator, + ArrayList args) throws SemanticException { + PTFExpressionDef symboPatternArg = args.get(0); + ObjectInspector symbolPatternArgOI = symboPatternArg.getOI(); + + if ( !ObjectInspectorUtils.isConstantObjectInspector(symbolPatternArgOI) || + (symbolPatternArgOI.getCategory() != ObjectInspector.Category.PRIMITIVE) || + ((PrimitiveObjectInspector)symbolPatternArgOI).getPrimitiveCategory() != + PrimitiveObjectInspector.PrimitiveCategory.STRING ) + { + throwErrorWithSignature("Currently the symbol Pattern must be a Constant String."); + } + + evaluator.patternStr = ((ConstantObjectInspector)symbolPatternArgOI). 
+ getWritableConstantValue().toString(); + } + + /* + * validate and setup SymbolInfo + */ + private void validateAndSetupSymbolInfo(RegexPath evaluator, + ArrayList args, + int argsNum) throws SemanticException { + int symbolArgsSz = argsNum - 2; + if ( symbolArgsSz % 2 != 0) + { + throwErrorWithSignature("Symbol Name, Expression need to be specified in pairs: " + + "there are odd number of symbol args"); + } + + evaluator.symInfo = new SymbolsInfo(symbolArgsSz/2); + for(int i=1; i <= symbolArgsSz; i += 2) + { + PTFExpressionDef symbolNameArg = args.get(i); + ObjectInspector symbolNameArgOI = symbolNameArg.getOI(); + + if ( !ObjectInspectorUtils.isConstantObjectInspector(symbolNameArgOI) || + (symbolNameArgOI.getCategory() != ObjectInspector.Category.PRIMITIVE) || + ((PrimitiveObjectInspector)symbolNameArgOI).getPrimitiveCategory() != + PrimitiveObjectInspector.PrimitiveCategory.STRING ) + { + throwErrorWithSignature( + String.format("Currently a Symbol Name(%s) must be a Constant String", + symbolNameArg.getExpressionTreeString())); + } + String symbolName = ((ConstantObjectInspector)symbolNameArgOI). 
+ getWritableConstantValue().toString(); + + PTFExpressionDef symolExprArg = args.get(i+1); + ObjectInspector symolExprArgOI = symolExprArg.getOI(); + if ( (symolExprArgOI.getCategory() != ObjectInspector.Category.PRIMITIVE) || + ((PrimitiveObjectInspector)symolExprArgOI).getPrimitiveCategory() != + PrimitiveObjectInspector.PrimitiveCategory.BOOLEAN ) + { + throwErrorWithSignature(String.format("Currently a Symbol Expression(%s) " + + "must be a boolean expression", symolExprArg.getExpressionTreeString())); + } + evaluator.symInfo.add(symbolName, symolExprArg); + } + } + + /* + * validate and setup resultExprStr + */ + private void validateAndSetupResultExprStr(RegexPath evaluator, + ArrayList args, + int argsNum) throws SemanticException { + PTFExpressionDef resultExprArg = args.get(argsNum - 1); + ObjectInspector resultExprArgOI = resultExprArg.getOI(); + + if ( !ObjectInspectorUtils.isConstantObjectInspector(resultExprArgOI) || + (resultExprArgOI.getCategory() != ObjectInspector.Category.PRIMITIVE) || + ((PrimitiveObjectInspector)resultExprArgOI).getPrimitiveCategory() != + PrimitiveObjectInspector.PrimitiveCategory.STRING ) + { + throwErrorWithSignature("Currently the result Expr parameter must be a Constant String."); + } + + evaluator.resultExprStr = ((ConstantObjectInspector)resultExprArgOI). + getWritableConstantValue().toString(); + } + + /* + * setup SymbolFunction chain. 
+ */ + private void setupSymbolFunctionChain(RegexPath evaluator) throws SemanticException { + SymbolParser syP = new SymbolParser(evaluator.patternStr, + evaluator.symInfo.symbolExprsNames, + evaluator.symInfo.symbolExprsEvaluators, evaluator.symInfo.symbolExprsOIs); + syP.parse(); + evaluator.syFn = syP.getSymbolFunction(); + } + + @Override + public boolean transformsRawInput() + { + return false; + } + + @Override + public void initializeOutputOI() throws HiveException { + try { + RegexPath evaluator = (RegexPath) getEvaluator(); + PartitionedTableFunctionDef tDef = evaluator.getTableDef(); + + ArrayList args = tDef.getArgs(); + int argsNum = args.size(); + + validateAndSetupPatternStr(evaluator, args); + validateAndSetupSymbolInfo(evaluator, args, argsNum); + validateAndSetupResultExprStr(evaluator, args, argsNum); + setupSymbolFunctionChain(evaluator); + + /* + * setup OI for input to resultExpr select list + */ + StructObjectInspector selectListInputOI = RegexPath.createSelectListOI( evaluator, + tDef.getInput()); + ResultExprInfo resultExprInfo = evaluator.resultExprInfo; + ArrayList selectListExprOIs = new ArrayList(); + resultExprInfo.resultExprEvals = new ArrayList(); + + for(int i=0 ; i < resultExprInfo.resultExprNodes.size(); i++) { + ExprNodeDesc selectColumnExprNode =resultExprInfo.resultExprNodes.get(i); + ExprNodeEvaluator selectColumnExprEval = + ExprNodeEvaluatorFactory.get(selectColumnExprNode); + ObjectInspector selectColumnOI = selectColumnExprEval.initialize(selectListInputOI); + resultExprInfo.resultExprEvals.add(selectColumnExprEval); + selectListExprOIs.add(selectColumnOI); + } + + resultExprInfo.resultOI = ObjectInspectorFactory.getStandardStructObjectInspector( + resultExprInfo.resultExprNames, selectListExprOIs); + setOutputOI(resultExprInfo.resultOI); + } + catch(SemanticException se) { + throw new HiveException(se); + } + } + + @Override + public ArrayList getOutputColumnNames() { + RegexPath evaluator = (RegexPath) getEvaluator(); + 
return evaluator.resultExprInfo.getResultExprNames(); + } + + } + + public ResultExprInfo getResultExprInfo() { + return resultExprInfo; + } + + public void setResultExprInfo(ResultExprInfo resultExprInfo) { + this.resultExprInfo = resultExprInfo; + } + + static class SymbolsInfo { + int sz; + ArrayList symbolExprsEvaluators; + ArrayList symbolExprsOIs; + ArrayList symbolExprsNames; + + SymbolsInfo(int sz) + { + this.sz = sz; + symbolExprsEvaluators = new ArrayList(sz); + symbolExprsOIs = new ArrayList(sz); + symbolExprsNames = new ArrayList(sz); + } + + void add(String name, PTFExpressionDef arg) + { + symbolExprsNames.add(name); + symbolExprsEvaluators.add(arg.getExprEvaluator()); + symbolExprsOIs.add(arg.getOI()); + } + } + + public static class ResultExprInfo { + ArrayList resultExprNames; + ArrayList resultExprNodes; + private transient ArrayList resultExprEvals; + private transient StructObjectInspector resultOI; + + public ArrayList getResultExprNames() { + return resultExprNames; + } + public void setResultExprNames(ArrayList resultExprNames) { + this.resultExprNames = resultExprNames; + } + public ArrayList getResultExprNodes() { + return resultExprNodes; + } + public void setResultExprNodes(ArrayList resultExprNodes) { + this.resultExprNodes = resultExprNodes; + } + } + + public static abstract class SymbolFunction + { + SymbolFunctionResult result; + + public SymbolFunction() + { + result = new SymbolFunctionResult(); + } + + public static SymbolFunctionResult match(SymbolFunction syFn, Object row, + PTFPartitionIterator pItr) throws HiveException + { + int resetToIdx = pItr.getIndex() - 1; + try + { + return syFn.match(row, pItr); + } finally + { + pItr.resetToIndex(resetToIdx); + } + } + + protected abstract SymbolFunctionResult match(Object row, PTFPartitionIterator pItr) + throws HiveException; + + protected abstract boolean isOptional(); + } + + public static class Symbol extends SymbolFunction { + ExprNodeEvaluator symbolExprEval; + Converter 
converter; + + public Symbol(ExprNodeEvaluator symbolExprEval, ObjectInspector symbolOI) + { + this.symbolExprEval = symbolExprEval; + converter = ObjectInspectorConverters.getConverter( + symbolOI, + PrimitiveObjectInspectorFactory.javaBooleanObjectInspector); + } + + @Override + protected SymbolFunctionResult match(Object row, PTFPartitionIterator pItr) + throws HiveException + { + Object val = null; + val = symbolExprEval.evaluate(row); + val = converter.convert(val); + result.matches = ((Boolean) val).booleanValue(); + result.nextRow = pItr.getIndex(); + + return result; + } + + @Override + protected boolean isOptional() + { + return false; + } + } + + public static class Star extends SymbolFunction { + SymbolFunction symbolFn; + + public Star(SymbolFunction symbolFn) + { + this.symbolFn = symbolFn; + } + + @Override + protected SymbolFunctionResult match(Object row, PTFPartitionIterator pItr) + throws HiveException + { + result.matches = true; + SymbolFunctionResult rowResult = symbolFn.match(row, pItr); + + while (rowResult.matches && pItr.hasNext()) + { + row = pItr.next(); + rowResult = symbolFn.match(row, pItr); + } + + result.nextRow = pItr.getIndex() - 1; + return result; + } + + @Override + protected boolean isOptional() + { + return true; + } + } + + public static class Plus extends SymbolFunction { + SymbolFunction symbolFn; + + public Plus(SymbolFunction symbolFn) + { + this.symbolFn = symbolFn; + } + + @Override + protected SymbolFunctionResult match(Object row, PTFPartitionIterator pItr) + throws HiveException + { + SymbolFunctionResult rowResult = symbolFn.match(row, pItr); + + if (!rowResult.matches) + { + result.matches = false; + result.nextRow = pItr.getIndex() - 1; + return result; + } + + result.matches = true; + while (rowResult.matches && pItr.hasNext()) + { + row = pItr.next(); + rowResult = symbolFn.match(row, pItr); + } + + result.nextRow = pItr.getIndex() - 1; + return result; + } + + @Override + protected boolean isOptional() + { + 
return false; + } + } + + public static class Chain extends SymbolFunction + { + ArrayList components; + + public Chain(ArrayList components) + { + this.components = components; + } + + /* + * Iterate over the Symbol Functions in the Chain: + * - If we are not at the end of the Iterator (i.e. row != null ) + * - match the current componentFn + * - if it returns false, then return false + * - otherwise set row to the next row from the Iterator. + * - if we are at the end of the Iterator + * - skip any optional Symbol Fns (star patterns) at the end. + * - but if we come to a non optional Symbol Fn, return false. + * - if we match all Fns in the chain return true. + */ + @Override + protected SymbolFunctionResult match(Object row, PTFPartitionIterator pItr) + throws HiveException + { + SymbolFunctionResult componentResult = null; + for (SymbolFunction sFn : components) + { + if (row != null) + { + componentResult = sFn.match(row, pItr); + if (!componentResult.matches) + { + result.matches = false; + result.nextRow = componentResult.nextRow; + return result; + } + row = pItr.resetToIndex(componentResult.nextRow); + } + else + { + if (!sFn.isOptional()) + { + result.matches = false; + result.nextRow = componentResult.nextRow; + return result; + } + } + } + + result.matches = true; + result.nextRow = componentResult.nextRow; + return result; + } + + @Override + protected boolean isOptional() + { + return false; + } + } + + + public static class SymbolFunctionResult + { + /* + * does the row match the pattern represented by this SymbolFunction + */ + public boolean matches; + /* + * what is the index of the row beyond the set of rows that match this pattern. 
+ */ + public int nextRow; + } + + public static class SymbolParser + { + String patternStr; + String[] symbols; + HashMap symbolExprEvalMap; + ArrayList symbolFunctions; + Chain symbolFnChain; + + + public SymbolParser(String patternStr, ArrayList symbolNames, + ArrayList symbolExprEvals, ArrayList symbolExprOIs) + { + super(); + this.patternStr = patternStr; + symbolExprEvalMap = new HashMap(); + int sz = symbolNames.size(); + for(int i=0; i < sz; i++) + { + String symbolName = symbolNames.get(i); + ExprNodeEvaluator symbolExprEval = symbolExprEvals.get(i); + ObjectInspector symbolExprOI = symbolExprOIs.get(i); + symbolExprEvalMap.put(symbolName.toLowerCase(), + new Object[] {symbolExprEval, symbolExprOI}); + } + } + + public SymbolFunction getSymbolFunction() + { + return symbolFnChain; + } + + public void parse() throws SemanticException + { + symbols = patternStr.split("\\."); + symbolFunctions = new ArrayList(); + + for(String symbol : symbols) + { + boolean isStar = symbol.endsWith("*"); + boolean isPlus = symbol.endsWith("+"); + + symbol = (isStar || isPlus) ? symbol.substring(0, symbol.length() - 1) : symbol; + Object[] symbolDetails = symbolExprEvalMap.get(symbol.toLowerCase()); + if ( symbolDetails == null ) + { + throw new SemanticException(String.format("Unknown Symbol %s", symbol)); + } + + ExprNodeEvaluator symbolExprEval = (ExprNodeEvaluator) symbolDetails[0]; + ObjectInspector symbolExprOI = (ObjectInspector) symbolDetails[1]; + SymbolFunction sFn = new Symbol(symbolExprEval, symbolExprOI); + + if ( isStar ) + { + sFn = new Star(sFn); + } + else if ( isPlus ) + { + sFn = new Plus(sFn); + } + symbolFunctions.add(sFn); + } + symbolFnChain = new Chain(symbolFunctions); + } + } + + /* + * ResultExpression is a Select List with the following variation: + * - the select keyword is optional. The parser checks if the expression doesn't start with + * select; if not it prefixes it. + * - Window Fn clauses are not permitted. 
+ * - expressions can operate on the input columns plus the psuedo column 'path' + * which is array of + * structs. The shape of the struct is + * the same as the input. + */ + public static class ResultExpressionParser { + String resultExprString; + + RowResolver selectListInputRowResolver; + TypeCheckCtx selectListInputTypeCheckCtx; + StructObjectInspector selectListInputOI; + + ArrayList selectSpec; + + ResultExprInfo resultExprInfo; + + public ResultExpressionParser(String resultExprString, + RowResolver selectListInputRowResolver) + { + this.resultExprString = resultExprString; + this.selectListInputRowResolver = selectListInputRowResolver; + } + + public void translate() throws SemanticException, HiveException + { + setupSelectListInputInfo(); + fixResultExprString(); + parse(); + validateSelectExpr(); + buildSelectListEvaluators(); + } + + public ResultExprInfo getResultExprInfo() { + return resultExprInfo; + } + + private void buildSelectListEvaluators() throws SemanticException, HiveException + { + resultExprInfo = new ResultExprInfo(); + resultExprInfo.resultExprEvals = new ArrayList(); + resultExprInfo.resultExprNames = new ArrayList(); + resultExprInfo.resultExprNodes = new ArrayList(); + //result + ArrayList selectListExprOIs = new ArrayList(); + int i = 0; + for(WindowExpressionSpec expr : selectSpec) + { + String selectColName = expr.getAlias(); + ASTNode selectColumnNode = expr.getExpression(); + ExprNodeDesc selectColumnExprNode = + ResultExpressionParser.buildExprNode(selectColumnNode, + selectListInputTypeCheckCtx); + ExprNodeEvaluator selectColumnExprEval = + ExprNodeEvaluatorFactory.get(selectColumnExprNode); + ObjectInspector selectColumnOI = null; + selectColumnOI = selectColumnExprEval.initialize(selectListInputOI); + + selectColName = getColumnName(selectColName, selectColumnExprNode, i); + + resultExprInfo.resultExprEvals.add(selectColumnExprEval); + selectListExprOIs.add(selectColumnOI); + 
resultExprInfo.resultExprNodes.add(selectColumnExprNode); + resultExprInfo.resultExprNames.add(selectColName); + i++; + } + + resultExprInfo.resultOI = ObjectInspectorFactory.getStandardStructObjectInspector( + resultExprInfo.resultExprNames, selectListExprOIs); + } + + private void setupSelectListInputInfo() throws SemanticException + { + selectListInputTypeCheckCtx = new TypeCheckCtx(selectListInputRowResolver); + selectListInputTypeCheckCtx.setUnparseTranslator(null); + /* + * create SelectListOI + */ + selectListInputOI = (StructObjectInspector) + PTFTranslator.getStandardStructOI(selectListInputRowResolver); + } + + private void fixResultExprString() + { + String r = resultExprString.trim(); + String prefix = r.substring(0, 6); + if (!prefix.toLowerCase().equals("select")) + { + r = "select " + r; + } + resultExprString = r; + } + + private void parse() throws SemanticException + { + selectSpec = SemanticAnalyzer.parseSelect(resultExprString); + } + + private void validateSelectExpr() throws SemanticException + { + for (WindowExpressionSpec expr : selectSpec) + { + PTFTranslator.validateNoLeadLagInValueBoundarySpec(expr.getExpression()); + } + } + + private String getColumnName(String alias, ExprNodeDesc exprNode, int colIdx) + { + if (alias != null) + { + return alias; + } + else if (exprNode instanceof ExprNodeColumnDesc) + { + ExprNodeColumnDesc colDesc = (ExprNodeColumnDesc) exprNode; + return colDesc.getColumn(); + } + return "regexpath_col_" + colIdx; + } + + public static ExprNodeDesc buildExprNode(ASTNode expr, + TypeCheckCtx typeCheckCtx) throws SemanticException + { + // todo: use SemanticAnalyzer::genExprNodeDesc + // currently SA not available to PTFTranslator. 
+ Map map = TypeCheckProcFactory + .genExprNode(expr, typeCheckCtx); + ExprNodeDesc desc = map.get(expr); + if (desc == null) { + String errMsg = typeCheckCtx.getError(); + if ( errMsg == null) { + errMsg = "Error in parsing "; + } + throw new SemanticException(errMsg); + } + return desc; + } + } + + public static final String PATHATTR_NAME = "tpath"; + + /* + * add array to the list of columns + */ + protected static RowResolver createSelectListRR(RegexPath evaluator, + PTFInputDef inpDef) throws SemanticException { + RowResolver rr = new RowResolver(); + RowResolver inputRR = inpDef.getOutputShape().getRr(); + + evaluator.inputColumnNamesMap = new HashMap(); + ArrayList inputColumnNames = new ArrayList(); + + ArrayList inpColOIs = new ArrayList(); + + for (ColumnInfo inpCInfo : inputRR.getColumnInfos()) { + ColumnInfo cInfo = new ColumnInfo(inpCInfo); + String colAlias = cInfo.getAlias(); + + String[] tabColAlias = inputRR.reverseLookup(inpCInfo.getInternalName()); + if (tabColAlias != null) { + colAlias = tabColAlias[1]; + } + ASTNode inExpr = null; + inExpr = PTFTranslator.getASTNode(inpCInfo, inputRR); + if ( inExpr != null ) { + rr.putExpression(inExpr, cInfo); + colAlias = inExpr.toStringTree().toLowerCase(); + } + else { + colAlias = colAlias == null ? 
cInfo.getInternalName() : colAlias; + rr.put(cInfo.getTabAlias(), colAlias, cInfo); + } + + evaluator.inputColumnNamesMap.put(cInfo.getInternalName(), colAlias); + inputColumnNames.add(colAlias); + inpColOIs.add(cInfo.getObjectInspector()); + } + + StandardListObjectInspector pathAttrOI = + ObjectInspectorFactory.getStandardListObjectInspector( + ObjectInspectorFactory.getStandardStructObjectInspector(inputColumnNames, + inpColOIs)); + + ColumnInfo pathColumn = new ColumnInfo(PATHATTR_NAME, + TypeInfoUtils.getTypeInfoFromObjectInspector(pathAttrOI), + null, + false, false); + rr.put(null, PATHATTR_NAME, pathColumn); + + return rr; + } + + protected static StructObjectInspector createSelectListOI(RegexPath evaluator, PTFInputDef inpDef) { + StructObjectInspector inOI = inpDef.getOutputShape().getOI(); + ArrayList inputColumnNames = new ArrayList(); + ArrayList selectListNames = new ArrayList(); + ArrayList fieldOIs = new ArrayList(); + for(StructField f : inOI.getAllStructFieldRefs()) { + String inputColName = evaluator.inputColumnNamesMap.get(f.getFieldName()); + if ( inputColName != null ) { + inputColumnNames.add(inputColName); + selectListNames.add(f.getFieldName()); + fieldOIs.add(f.getFieldObjectInspector()); + } + } + + StandardListObjectInspector pathAttrOI = + ObjectInspectorFactory.getStandardListObjectInspector( + ObjectInspectorFactory.getStandardStructObjectInspector(inputColumnNames, + fieldOIs)); + + ArrayList selectFieldOIs = new ArrayList(); + selectFieldOIs.addAll(fieldOIs); + selectFieldOIs.add(pathAttrOI); + selectListNames.add(RegexPath.PATHATTR_NAME); + return ObjectInspectorFactory.getStandardStructObjectInspector( + selectListNames, selectFieldOIs); + } + + public static Object getSelectListInput(Object currRow, ObjectInspector rowOI, + PTFPartitionIterator pItr, int sz) throws HiveException { + ArrayList oRow = new ArrayList(); + List currRowAsStdObject = (List) ObjectInspectorUtils + .copyToStandardObject(currRow, rowOI); + 
oRow.addAll(currRowAsStdObject); + oRow.add(getPath(currRow, rowOI, pItr, sz)); + return oRow; + } + + public static ArrayList getPath(Object currRow, ObjectInspector rowOI, + PTFPartitionIterator pItr, int sz) throws HiveException { + int idx = pItr.getIndex() - 1; + ArrayList path = new ArrayList(); + path.add(ObjectInspectorUtils.copyToStandardObject(currRow, rowOI)); + int pSz = 1; + + while (pSz < sz && pItr.hasNext()) + { + currRow = pItr.next(); + path.add(ObjectInspectorUtils.copyToStandardObject(currRow, rowOI)); + pSz++; + } + pItr.resetToIndex(idx); + return path; + } +} Index: ql/src/test/queries/clientpositive/ptf_regexpath.q =================================================================== --- ql/src/test/queries/clientpositive/ptf_regexpath.q (revision 0) +++ ql/src/test/queries/clientpositive/ptf_regexpath.q (working copy) @@ -0,0 +1,36 @@ +DROP TABLE IF EXISTS flights_tiny; + +create table flights_tiny ( +ORIGIN_CITY_NAME string, +DEST_CITY_NAME string, +YEAR int, +MONTH int, +DAY_OF_MONTH int, +ARR_DELAY float, +FL_NUM string +); + +LOAD DATA LOCAL INPATH '../data/files/flights_tiny.txt' OVERWRITE INTO TABLE flights_tiny; + +-- 1. basic Regexpath test +select origin_city_name, fl_num, year, month, day_of_month, sz, tpath +from regex_path(on + flights_tiny + distribute by fl_num + sort by year, month, day_of_month + arg1('LATE.LATE+'), + arg2('LATE'), arg3(arr_delay > 15), + arg4('origin_city_name, fl_num, year, month, day_of_month, size(tpath) as sz, tpath[0].day_of_month as tpath') + ); + +-- 2. 
regex_path on 1 partition +select origin_city_name, fl_num, year, month, day_of_month, sz, tpath +from regex_path(on + flights_tiny + sort by fl_num, year, month, day_of_month + arg1('LATE.LATE+'), + arg2('LATE'), arg3(arr_delay > 15), + arg4('origin_city_name, fl_num, year, month, day_of_month, size(tpath) as sz, tpath[0].day_of_month as tpath') + ) +where fl_num = 1142; + Index: ql/src/test/queries/clientpositive/ptf_register_tblfn.q =================================================================== --- ql/src/test/queries/clientpositive/ptf_register_tblfn.q (revision 1520834) +++ ql/src/test/queries/clientpositive/ptf_register_tblfn.q (working copy) @@ -1,4 +1,4 @@ -DROP TABLE flights_tiny; +DROP TABLE IF EXISTS flights_tiny; create table flights_tiny ( ORIGIN_CITY_NAME string, @@ -12,12 +12,12 @@ LOAD DATA LOCAL INPATH '../data/files/flights_tiny.txt' OVERWRITE INTO TABLE flights_tiny; -create temporary function npathtest as 'org.apache.hadoop.hive.ql.udf.ptf.NPath$NPathResolver'; +create temporary function regexpathtest as 'org.apache.hadoop.hive.ql.udf.ptf.RegexPath$RegexPathResolver'; -- 1. 
basic Npath test select origin_city_name, fl_num, year, month, day_of_month, sz, tpath -from npathtest(on +from regexpathtest(on flights_tiny distribute by fl_num sort by year, month, day_of_month @@ -26,4 +26,4 @@ arg4('origin_city_name, fl_num, year, month, day_of_month, size(tpath) as sz, tpath[0].day_of_month as tpath') ); -drop temporary function npathtest; +drop temporary function regexpathtest; Index: ql/src/test/results/clientpositive/ptf_regexpath.q.out =================================================================== --- ql/src/test/results/clientpositive/ptf_regexpath.q.out (revision 0) +++ ql/src/test/results/clientpositive/ptf_regexpath.q.out (working copy) @@ -0,0 +1,104 @@ +PREHOOK: query: DROP TABLE IF EXISTS flights_tiny +PREHOOK: type: DROPTABLE +POSTHOOK: query: DROP TABLE IF EXISTS flights_tiny +POSTHOOK: type: DROPTABLE +PREHOOK: query: create table flights_tiny ( +ORIGIN_CITY_NAME string, +DEST_CITY_NAME string, +YEAR int, +MONTH int, +DAY_OF_MONTH int, +ARR_DELAY float, +FL_NUM string +) +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table flights_tiny ( +ORIGIN_CITY_NAME string, +DEST_CITY_NAME string, +YEAR int, +MONTH int, +DAY_OF_MONTH int, +ARR_DELAY float, +FL_NUM string +) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@flights_tiny +PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/flights_tiny.txt' OVERWRITE INTO TABLE flights_tiny +PREHOOK: type: LOAD +PREHOOK: Output: default@flights_tiny +POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/flights_tiny.txt' OVERWRITE INTO TABLE flights_tiny +POSTHOOK: type: LOAD +POSTHOOK: Output: default@flights_tiny +PREHOOK: query: -- 1. 
basic Regexpath test +select origin_city_name, fl_num, year, month, day_of_month, sz, tpath +from regex_path(on + flights_tiny + distribute by fl_num + sort by year, month, day_of_month + arg1('LATE.LATE+'), + arg2('LATE'), arg3(arr_delay > 15), + arg4('origin_city_name, fl_num, year, month, day_of_month, size(tpath) as sz, tpath[0].day_of_month as tpath') + ) +PREHOOK: type: QUERY +PREHOOK: Input: default@flights_tiny +#### A masked pattern was here #### +POSTHOOK: query: -- 1. basic Regexpath test +select origin_city_name, fl_num, year, month, day_of_month, sz, tpath +from regex_path(on + flights_tiny + distribute by fl_num + sort by year, month, day_of_month + arg1('LATE.LATE+'), + arg2('LATE'), arg3(arr_delay > 15), + arg4('origin_city_name, fl_num, year, month, day_of_month, size(tpath) as sz, tpath[0].day_of_month as tpath') + ) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@flights_tiny +#### A masked pattern was here #### +Baltimore 1142 2010 10 20 6 20 +Baltimore 1142 2010 10 21 5 21 +Baltimore 1142 2010 10 22 4 22 +Baltimore 1142 2010 10 25 3 25 +Baltimore 1142 2010 10 26 2 26 +Chicago 1531 2010 10 21 2 21 +Chicago 1531 2010 10 25 3 25 +Chicago 1531 2010 10 26 2 26 +Baltimore 1599 2010 10 21 2 21 +Baltimore 1599 2010 10 25 3 25 +Baltimore 1599 2010 10 26 2 26 +Chicago 361 2010 10 20 2 20 +Washington 7291 2010 10 27 2 27 +Chicago 897 2010 10 20 4 20 +Chicago 897 2010 10 21 3 21 +Chicago 897 2010 10 22 2 22 +PREHOOK: query: -- 2. regex_path on 1 partition +select origin_city_name, fl_num, year, month, day_of_month, sz, tpath +from regex_path(on + flights_tiny + sort by fl_num, year, month, day_of_month + arg1('LATE.LATE+'), + arg2('LATE'), arg3(arr_delay > 15), + arg4('origin_city_name, fl_num, year, month, day_of_month, size(tpath) as sz, tpath[0].day_of_month as tpath') + ) +where fl_num = 1142 +PREHOOK: type: QUERY +PREHOOK: Input: default@flights_tiny +#### A masked pattern was here #### +POSTHOOK: query: -- 2. 
regex_path on 1 partition +select origin_city_name, fl_num, year, month, day_of_month, sz, tpath +from regex_path(on + flights_tiny + sort by fl_num, year, month, day_of_month + arg1('LATE.LATE+'), + arg2('LATE'), arg3(arr_delay > 15), + arg4('origin_city_name, fl_num, year, month, day_of_month, size(tpath) as sz, tpath[0].day_of_month as tpath') + ) +where fl_num = 1142 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@flights_tiny +#### A masked pattern was here #### +Baltimore 1142 2010 10 20 6 20 +Baltimore 1142 2010 10 21 5 21 +Baltimore 1142 2010 10 22 4 22 +Baltimore 1142 2010 10 25 3 25 +Baltimore 1142 2010 10 26 2 26 Index: ql/src/test/results/clientpositive/ptf_register_tblfn.q.out =================================================================== --- ql/src/test/results/clientpositive/ptf_register_tblfn.q.out (revision 1520834) +++ ql/src/test/results/clientpositive/ptf_register_tblfn.q.out (working copy) @@ -1,6 +1,6 @@ -PREHOOK: query: DROP TABLE flights_tiny +PREHOOK: query: DROP TABLE IF EXISTS flights_tiny PREHOOK: type: DROPTABLE -POSTHOOK: query: DROP TABLE flights_tiny +POSTHOOK: query: DROP TABLE IF EXISTS flights_tiny POSTHOOK: type: DROPTABLE PREHOOK: query: create table flights_tiny ( ORIGIN_CITY_NAME string, @@ -29,13 +29,13 @@ POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/flights_tiny.txt' OVERWRITE INTO TABLE flights_tiny POSTHOOK: type: LOAD POSTHOOK: Output: default@flights_tiny -PREHOOK: query: create temporary function npathtest as 'org.apache.hadoop.hive.ql.udf.ptf.NPath$NPathResolver' +PREHOOK: query: create temporary function regexpathtest as 'org.apache.hadoop.hive.ql.udf.ptf.RegexPath$RegexPathResolver' PREHOOK: type: CREATEFUNCTION -POSTHOOK: query: create temporary function npathtest as 'org.apache.hadoop.hive.ql.udf.ptf.NPath$NPathResolver' +POSTHOOK: query: create temporary function regexpathtest as 'org.apache.hadoop.hive.ql.udf.ptf.RegexPath$RegexPathResolver' POSTHOOK: type: CREATEFUNCTION PREHOOK: query: -- 1. 
basic Npath test select origin_city_name, fl_num, year, month, day_of_month, sz, tpath -from npathtest(on +from regexpathtest(on flights_tiny distribute by fl_num sort by year, month, day_of_month @@ -48,7 +48,7 @@ #### A masked pattern was here #### POSTHOOK: query: -- 1. basic Npath test select origin_city_name, fl_num, year, month, day_of_month, sz, tpath -from npathtest(on +from regexpathtest(on flights_tiny distribute by fl_num sort by year, month, day_of_month @@ -75,7 +75,7 @@ Chicago 897 2010 10 20 4 20 Chicago 897 2010 10 21 3 21 Chicago 897 2010 10 22 2 22 -PREHOOK: query: drop temporary function npathtest +PREHOOK: query: drop temporary function regexpathtest PREHOOK: type: DROPFUNCTION -POSTHOOK: query: drop temporary function npathtest +POSTHOOK: query: drop temporary function regexpathtest POSTHOOK: type: DROPFUNCTION Index: ql/src/test/results/clientpositive/show_functions.q.out =================================================================== --- ql/src/test/results/clientpositive/show_functions.q.out (revision 1520834) +++ ql/src/test/results/clientpositive/show_functions.q.out (working copy) @@ -116,7 +116,6 @@ noop noopwithmap not -npath ntile nvl or @@ -136,6 +135,7 @@ rank reflect reflect2 +regex_path regexp regexp_extract regexp_replace