Index: ql/src/test/results/clientpositive/udf_split.q.out
===================================================================
--- ql/src/test/results/clientpositive/udf_split.q.out	(revision 0)
+++ ql/src/test/results/clientpositive/udf_split.q.out	(revision 0)
@@ -0,0 +1,46 @@
+query: EXPLAIN SELECT
+  split('a b c', ' '),
+  split('oneAtwoBthreeC', '[ABC]'),
+  split('', '.')
+FROM src LIMIT 1
+ABSTRACT SYNTAX TREE:
+  (TOK_QUERY (TOK_FROM (TOK_TABREF src)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION split 'a b c' ' ')) (TOK_SELEXPR (TOK_FUNCTION split 'oneAtwoBthreeC' '[ABC]')) (TOK_SELEXPR (TOK_FUNCTION split '' '.'))) (TOK_LIMIT 1)))
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 is a root stage
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Alias -> Map Operator Tree:
+        src
+          Select Operator
+            expressions:
+                  expr: SPLIT('a b c', ' ')
+                  type: array<string>
+                  expr: SPLIT('oneAtwoBthreeC', '[ABC]')
+                  type: array<string>
+                  expr: SPLIT('', '.')
+                  type: array<string>
+            Limit
+              File Output Operator
+                compressed: false
+                GlobalTableId: 0
+                table:
+                    input format: org.apache.hadoop.mapred.TextInputFormat
+                    output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: 1
+
+
+query: SELECT
+  split('a b c', ' '),
+  split('oneAtwoBthreeC', '[ABC]'),
+  split('', '.')
+FROM src LIMIT 1
+Input: default/src
+Output: file:/data/users/emil/hive1/hive1/build/ql/tmp/6715761/10000
+["a","b","c"]	["one","two","three"]	[]
Index: ql/src/test/queries/clientpositive/udf_split.q
===================================================================
--- ql/src/test/queries/clientpositive/udf_split.q	(revision 0)
+++ ql/src/test/queries/clientpositive/udf_split.q	(revision 0)
@@ -0,0 +1,11 @@
+EXPLAIN SELECT
+  split('a b c', ' '),
+  split('oneAtwoBthreeC', '[ABC]'),
+  split('', '.')
+FROM src LIMIT 1;
+
+SELECT
+  split('a b c', ' '),
+  split('oneAtwoBthreeC', '[ABC]'),
+  split('', '.')
+FROM src LIMIT 1;
Index: ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java	(revision 795925)
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java	(working copy)
@@ -91,6 +91,7 @@
     registerUDF("regexp_replace", UDFRegExpReplace.class, OperatorType.PREFIX, false);
     registerUDF("regexp_extract", UDFRegExpExtract.class, OperatorType.PREFIX, false);
     registerUDF("parse_url", UDFParseUrl.class, OperatorType.PREFIX, false);
+    registerGenericUDF("split", GenericUDFSplit.class);
 
     registerUDF("positive", UDFOPPositive.class, OperatorType.PREFIX, true, "+");
     registerUDF("negative", UDFOPNegative.class, OperatorType.PREFIX, true, "-");
Index: ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFSplit.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFSplit.java	(revision 0)
+++ ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFSplit.java	(revision 0)
@@ -0,0 +1,76 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.udf.generic;
+
+import java.util.ArrayList;
+
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableStringObjectInspector;
+import org.apache.hadoop.io.Text;
+
+public class GenericUDFSplit extends GenericUDF {
+  public ObjectInspector initialize(ObjectInspector[] arguments)
+      throws UDFArgumentException {
+    if (arguments.length != 2) {
+      throw new UDFArgumentLengthException(
+          "The function SPLIT(s, regexp) takes exactly 2 arguments.");
+    }
+
+    if (!(arguments[0] instanceof WritableStringObjectInspector)) {
+      throw new UDFArgumentTypeException(0,
+          "Arguments of SPLIT(s, regexp) must be strings.");
+    }
+    if (!(arguments[1] instanceof WritableStringObjectInspector)) {
+      throw new UDFArgumentTypeException(1,
+          "Arguments of SPLIT(s, regexp) must be strings.");
+    }
+
+    return ObjectInspectorFactory.getStandardListObjectInspector(
+        PrimitiveObjectInspectorFactory.writableStringObjectInspector);
+  }
+
+  public Object evaluate(DeferredObject[] arguments) throws HiveException {
+    assert(arguments.length == 2);
+
+    Text s = PrimitiveObjectInspectorFactory.writableStringObjectInspector
+        .getPrimitiveWritableObject(arguments[0].get());
+    Text regex = PrimitiveObjectInspectorFactory.writableStringObjectInspector
+        .getPrimitiveWritableObject(arguments[1].get());
+
+    ArrayList<Text> result = new ArrayList<Text>();
+
+    for (String str : s.toString().split(regex.toString())) {
+      result.add(new Text(str));
+    }
+
+    return result;
+  }
+
+  public String getDisplayString(String[] children) {
+    assert(children.length == 2);
+    return "SPLIT(" + children[0] + ", " + children[1] + ")";
+  }
+
+}
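
Usage note (illustrative, not part of the patch): the new GenericUDF can also be driven directly, the same way Hive's expression evaluator drives it at query time: initialize() with the argument ObjectInspectors, then evaluate() with DeferredObject-wrapped values. Below is a minimal sketch assuming the single-method DeferredObject interface from this revision; the SplitSmokeTest class and wrap() helper are hypothetical scaffolding, not Hive API:

    import org.apache.hadoop.hive.ql.metadata.HiveException;
    import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
    import org.apache.hadoop.hive.ql.udf.generic.GenericUDFSplit;
    import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
    import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
    import org.apache.hadoop.io.Text;

    public class SplitSmokeTest {
      // Hypothetical helper: wraps a pre-computed value so the UDF can pull
      // it lazily through DeferredObject.get(), as the evaluator would.
      static GenericUDF.DeferredObject wrap(final Object o) {
        return new GenericUDF.DeferredObject() {
          public Object get() throws HiveException {
            return o;
          }
        };
      }

      public static void main(String[] args) throws Exception {
        ObjectInspector stringOI =
            PrimitiveObjectInspectorFactory.writableStringObjectInspector;

        GenericUDFSplit udf = new GenericUDFSplit();
        // initialize() type-checks both arguments as strings and declares
        // the return type: a standard list of writable strings.
        udf.initialize(new ObjectInspector[] { stringOI, stringOI });

        // evaluate() regex-splits the first argument by the second.
        Object result = udf.evaluate(new GenericUDF.DeferredObject[] {
            wrap(new Text("a b c")), wrap(new Text(" ")) });
        System.out.println(result);  // [a, b, c]
      }
    }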