diff --git ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFSplit.java ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFSplit.java
index 1fe611d..c2926b6 100644
--- ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFSplit.java
+++ ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFSplit.java
@@ -19,6 +19,7 @@
 package org.apache.hadoop.hive.ql.udf.generic;
 
 import java.util.ArrayList;
+import java.util.regex.Pattern;
 
 import org.apache.hadoop.hive.ql.exec.Description;
 import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
@@ -40,6 +41,15 @@
     + "  [\"one\", \"two\", \"three\"]")
 public class GenericUDFSplit extends GenericUDF {
   private transient ObjectInspectorConverters.Converter[] converters;
+  // Copy of the delimiter from the previous evaluate() call; null before the first call.
+  private transient Text lastRegex;
+  // Compiled form of lastRegex; null while the literal fast path is in use.
+  private transient Pattern pattern;
+  // True when lastRegex has no regex metacharacters and can be matched byte-for-byte.
+  private transient boolean isSimpleRegex;
+  // One or more characters that have no special meaning in a regular expression.
+  private static final Pattern simpleRegexPattern =
+      Pattern.compile("[a-zA-Z0-9 ,%#@!&=\"';:']+");
 
   @Override
   public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
@@ -70,10 +80,71 @@ public Object evaluate(DeferredObject[] arguments) throws HiveException {
     Text s = (Text) converters[0].convert(arguments[0].get());
     Text regex = (Text) converters[1].convert(arguments[1].get());
 
+    // Re-classify (and, if needed, recompile) the delimiter only when it differs from the
+    // one seen on the previous call.
+    if (lastRegex == null || !regex.equals(lastRegex)) {
+      String regexString = regex.toString();
+      if (simpleRegexPattern.matcher(regexString).matches()) {
+        isSimpleRegex = true;
+        pattern = null;
+      } else {
+        isSimpleRegex = false;
+        pattern = Pattern.compile(regexString);
+      }
+      // Keep a private copy: the converter may hand back the same Text instance on every
+      // call, in which case a stored reference would always compare equal to itself.
+      lastRegex = new Text(regex);
+    }
+
     ArrayList<Text> result = new ArrayList<Text>();
 
-    for (String str : s.toString().split(regex.toString())) {
-      result.add(new Text(str));
+    if (isSimpleRegex) {
+      // Literal delimiter: scan the UTF-8 bytes directly instead of running the regex engine.
+      // Only the first getLength() bytes of a Text backing array hold valid data.
+      byte[] sBytes = s.getBytes();
+      int sLength = s.getLength();
+      byte[] regexBytes = regex.getBytes();
+      int regexLength = regex.getLength();
+      int from = 0;
+      int i = 0;
+
+      while (i + regexLength <= sLength) {
+        boolean found = true;
+        for (int j = 0; j < regexLength; j++) {
+          if (sBytes[i + j] != regexBytes[j]) {
+            found = false;
+            break;
+          }
+        }
+
+        if (found) {
+          Text text = new Text();
+          text.set(sBytes, from, i - from);
+          result.add(text);
+          // Resume right after the delimiter so that adjacent delimiters are both seen.
+          i += regexLength;
+          from = i;
+        } else {
+          i++;
+        }
+      }
+
+      // Everything after the last delimiter (the whole input when none was found).
+      Text tail = new Text();
+      tail.set(sBytes, from, sLength - from);
+      result.add(tail);
+
+      // Pattern.split() drops trailing empty strings once at least one delimiter matched;
+      // do the same so both paths return identical results.
+      if (result.size() > 1) {
+        while (!result.isEmpty() && result.get(result.size() - 1).getLength() == 0) {
+          result.remove(result.size() - 1);
+        }
+      }
+    } else {
+      for (String str : pattern.split(s.toString())) {
+        result.add(new Text(str));
+      }
     }
 
     return result;
diff --git ql/src/test/queries/clientpositive/udf_split.q ql/src/test/queries/clientpositive/udf_split.q
index f799017..f3f8492 100644
--- ql/src/test/queries/clientpositive/udf_split.q
+++ ql/src/test/queries/clientpositive/udf_split.q
@@ -3,6 +3,7 @@ DESCRIBE FUNCTION EXTENDED split;
 
 EXPLAIN SELECT
   split('a b c', ' '),
+  split('aaa__bb__c', '__'),
   split('oneAtwoBthreeC', '[ABC]'),
   split('', '.'),
   split(50401020, 0)
@@ -10,6 +11,7 @@ FROM src LIMIT 1;
 
 SELECT
   split('a b c', ' '),
+  split('aaa__bb__c', '__'),
   split('oneAtwoBthreeC', '[ABC]'),
   split('', '.'),
   split(50401020, 0)
diff --git ql/src/test/results/clientpositive/udf_split.q.out ql/src/test/results/clientpositive/udf_split.q.out
index 34a70e0..9e2f2d0 100644
--- ql/src/test/results/clientpositive/udf_split.q.out
+++ ql/src/test/results/clientpositive/udf_split.q.out
@@ -13,6 +13,7 @@ Example:
   ["one", "two", "three"]
 PREHOOK: query: EXPLAIN SELECT
   split('a b c', ' '),
+  split('aaa__bb__c', '__'),
   split('oneAtwoBthreeC', '[ABC]'),
   split('', '.'),
   split(50401020, 0)
@@ -20,13 +21,14 @@ FROM src LIMIT 1
 PREHOOK: type: QUERY
 POSTHOOK: query: EXPLAIN SELECT
   split('a b c', ' '),
+  split('aaa__bb__c', '__'),
   split('oneAtwoBthreeC', '[ABC]'),
   split('', '.'),
   split(50401020, 0)
 FROM src LIMIT 1
 POSTHOOK: type: QUERY
 ABSTRACT SYNTAX TREE:
-  (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME src))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION split 'a b c' ' ')) (TOK_SELEXPR (TOK_FUNCTION split 'oneAtwoBthreeC' '[ABC]')) (TOK_SELEXPR (TOK_FUNCTION split '' '.')) (TOK_SELEXPR (TOK_FUNCTION split 50401020 0))) (TOK_LIMIT 1)))
+  (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME src))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION split 'a b c' ' ')) (TOK_SELEXPR (TOK_FUNCTION split 'aaa__bb__c' '__')) (TOK_SELEXPR (TOK_FUNCTION split 'oneAtwoBthreeC' '[ABC]')) (TOK_SELEXPR (TOK_FUNCTION split '' '.')) (TOK_SELEXPR (TOK_FUNCTION split 50401020 0))) (TOK_LIMIT 1)))
 
 STAGE DEPENDENCIES:
   Stage-1 is a root stage
@@ -43,13 +45,15 @@ STAGE PLANS:
               expressions:
                     expr: split('a b c', ' ')
                     type: array<string>
+                    expr: split('aaa__bb__c', '__')
+                    type: array<string>
                     expr: split('oneAtwoBthreeC', '[ABC]')
                     type: array<string>
                     expr: split('', '.')
                     type: array<string>
                     expr: split(50401020, 0)
                     type: array<string>
-              outputColumnNames: _col0, _col1, _col2, _col3
+              outputColumnNames: _col0, _col1, _col2, _col3, _col4
               Limit
                 File Output Operator
                   compressed: false
@@ -65,6 +69,7 @@ STAGE PLANS:
 
 PREHOOK: query: SELECT
   split('a b c', ' '),
+  split('aaa__bb__c', '__'),
   split('oneAtwoBthreeC', '[ABC]'),
   split('', '.'),
   split(50401020, 0)
@@ -74,6 +79,7 @@ PREHOOK: Input: default@src
 #### A masked pattern was here ####
 POSTHOOK: query: SELECT
   split('a b c', ' '),
+  split('aaa__bb__c', '__'),
   split('oneAtwoBthreeC', '[ABC]'),
   split('', '.'),
   split(50401020, 0)
@@ -81,4 +87,4 @@ FROM src LIMIT 1
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@src
 #### A masked pattern was here ####
-["a","b","c"]	["one","two","three"]	[]	["5","4","1","2"]
+["a","b","c"]	["aaa","bb","c"]	["one","two","three"]	[]	["5","4","1","2"]