diff --git ql/build.xml ql/build.xml index 46b4d23..a630db8 100644 --- ql/build.xml +++ ql/build.xml @@ -161,7 +161,7 @@ debug="${javac.debug}" deprecation="${javac.deprecation}" includeantruntime="false"> - + diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java index fc0256c..08c0c7c 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java @@ -93,6 +93,8 @@ import org.apache.hadoop.hive.ql.udf.UDFOPPositive; import org.apache.hadoop.hive.ql.udf.UDFPI; import org.apache.hadoop.hive.ql.udf.UDFParseUrl; +import org.apache.hadoop.hive.ql.udf.UDFUriEscape; +import org.apache.hadoop.hive.ql.udf.UDFUriUnescape; import org.apache.hadoop.hive.ql.udf.UDFPosMod; import org.apache.hadoop.hive.ql.udf.UDFPower; import org.apache.hadoop.hive.ql.udf.UDFRTrim; @@ -310,6 +312,8 @@ registerUDF("regexp_replace", UDFRegExpReplace.class, false); registerUDF("regexp_extract", UDFRegExpExtract.class, false); registerUDF("parse_url", UDFParseUrl.class, false); + registerUDF("uri_escape", UDFUriEscape.class, false); + registerUDF("uri_unescape", UDFUriUnescape.class, false); registerGenericUDF("nvl", GenericUDFNvl.class); registerGenericUDF("split", GenericUDFSplit.class); registerGenericUDF("str_to_map", GenericUDFStringToMap.class); diff --git ql/src/java/org/apache/hadoop/hive/ql/udf/UDFUriEscape.java ql/src/java/org/apache/hadoop/hive/ql/udf/UDFUriEscape.java new file mode 100644 index 0000000..61d3f07 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/udf/UDFUriEscape.java @@ -0,0 +1,64 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.udf;
+
+import java.io.UnsupportedEncodingException;
+import java.net.URLEncoder;
+
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDF;
+import org.apache.hadoop.io.Text;
+
+/**
+ * UDFUriEscape.
+ *
+ * NOTE(review): the @Description example output ends in ".%21%25%23", which its
+ * input does not contain; correcting it also means regenerating udf_uri_escape.q.out.
+ */
+@Description(name = "uri_escape",
+    value = "_FUNC_(uri) - Returns the uri encoded with ASCII encoding",
+    extended = "Returns NULL if argument is NULL.\n "
+    + "Example:\n "
+    + " > SELECT _FUNC_('http://www.someurl.com/has spaces in url') FROM src LIMIT 1;\n"
+    + "'http%3A%2F%2Fwww.someurl.com%2Fhas%20spaces%20in%20url.%21%25%23'")
+public class UDFUriEscape extends UDF {
+  private final Text escapedUri = new Text();
+
+  public UDFUriEscape() {
+  }
+
+  /**
+   * Percent-encodes the uri as UTF-8 bytes, writing a space as "%20" rather than "+".
+   *
+   * @param input the uri to be escaped.
+   * @return the escaped uri (the same Text instance on every call); null if the input is null.
+   */
+  public Text evaluate(Text input) {
+    if (input == null) {
+      return null;
+    }
+    try {
+      // UTF-8, not "ASCII": an ASCII encoder turns every non-ASCII character into "%3F".
+      escapedUri.set(URLEncoder.encode(input.toString(), "UTF-8").replace("+", "%20"));
+      return escapedUri;
+    } catch (UnsupportedEncodingException ignored) {
+      return null; // unreachable: every JVM is required to support UTF-8
+    }
+  }
+}
diff --git ql/src/java/org/apache/hadoop/hive/ql/udf/UDFUriUnescape.java ql/src/java/org/apache/hadoop/hive/ql/udf/UDFUriUnescape.java
new file mode 100644
index 0000000..9dc5ad5
--- /dev/null
+++ ql/src/java/org/apache/hadoop/hive/ql/udf/UDFUriUnescape.java
@@ -0,0 +1,66 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.udf;
+
+import java.io.UnsupportedEncodingException;
+import java.net.URLDecoder;
+
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDF;
+import org.apache.hadoop.io.Text;
+
+
+/**
+ * UDFUriUnescape. 
+ * NOTE(review): the @Description example output omits the ".!%#" that its input
+ * decodes to; correcting it also means regenerating udf_uri_unescape.q.out.
+ */
+@Description(name = "uri_unescape",
+    value = "_FUNC_(uri) - Returns the uri decoded",
+    extended = "Returns NULL if argument is NULL.\n "
+    + "Example:\n "
+    + " > SELECT _FUNC_('http%3A%2F%2Fwww.someurl.com%2Fhas%20spaces%20in%20url.%21%25%23') "
+    + "FROM src LIMIT 1;\n"
+    + "'http://www.someurl.com/has spaces in url'")
+public class UDFUriUnescape extends UDF {
+  private final Text unEscapedUri = new Text();
+
+  public UDFUriUnescape() {
+  }
+
+  /**
+   * Decodes the %XX escapes of a uri as UTF-8; a literal '+' is left unchanged.
+   * @param input the uri to be unescaped.
+   * @return the unescaped uri; null if the input is null or contains a malformed escape.
+   */
+  public Text evaluate(Text input) {
+    if (input == null) {
+      return null;
+    }
+    try {
+      // URLDecoder does form decoding ('+' means space); shield '+' to keep it literal.
+      unEscapedUri.set(URLDecoder.decode(input.toString().replace("+", "%2B"), "UTF-8"));
+      return unEscapedUri;
+    } catch (UnsupportedEncodingException ignored) {
+      return null; // unreachable: every JVM is required to support UTF-8
+    } catch (IllegalArgumentException malformed) {
+      return null; // e.g. "%zz" or a truncated "%4": NULL instead of failing the query
+    }
+  }
+}
diff --git ql/src/test/queries/clientpositive/udf_uri_escape.q ql/src/test/queries/clientpositive/udf_uri_escape.q
new file mode 100644
index 0000000..eb4fc05
--- /dev/null
+++ ql/src/test/queries/clientpositive/udf_uri_escape.q
@@ -0,0 +1,9 @@
+DESCRIBE FUNCTION uri_escape;
+DESCRIBE FUNCTION EXTENDED uri_escape;
+
+EXPLAIN
+SELECT uri_escape('http://google.com/resource?key=value1 & value2')
+FROM src LIMIT 1;
+
+SELECT uri_escape('http://google.com/resource?key=value1 & value2')
+FROM src LIMIT 1;
diff --git ql/src/test/queries/clientpositive/udf_uri_unescape.q ql/src/test/queries/clientpositive/udf_uri_unescape.q
new file mode 100644
index 0000000..68aab6c
--- /dev/null
+++ ql/src/test/queries/clientpositive/udf_uri_unescape.q
@@ -0,0 +1,9 @@
+DESCRIBE FUNCTION uri_unescape;
+DESCRIBE FUNCTION EXTENDED uri_unescape;
+
+EXPLAIN
+SELECT uri_unescape('http%3A%2F%2Fgoogle.com%2Fresource%3Fkey%3Dvalue1%20%26%20value2')
+FROM src LIMIT 1;
+
+SELECT uri_unescape('http%3A%2F%2Fgoogle.com%2Fresource%3Fkey%3Dvalue1%20%26%20value2')
+FROM src LIMIT 1;
diff --git 
ql/src/test/results/clientpositive/udf_uri_escape.q.out ql/src/test/results/clientpositive/udf_uri_escape.q.out new file mode 100644 index 0000000..54c9202 --- /dev/null +++ ql/src/test/results/clientpositive/udf_uri_escape.q.out @@ -0,0 +1,65 @@ +PREHOOK: query: DESCRIBE FUNCTION uri_escape +PREHOOK: type: DESCFUNCTION +POSTHOOK: query: DESCRIBE FUNCTION uri_escape +POSTHOOK: type: DESCFUNCTION +uri_escape(uri) - Returns the uri encoded with ASCII encoding +PREHOOK: query: DESCRIBE FUNCTION EXTENDED uri_escape +PREHOOK: type: DESCFUNCTION +POSTHOOK: query: DESCRIBE FUNCTION EXTENDED uri_escape +POSTHOOK: type: DESCFUNCTION +uri_escape(uri) - Returns the uri encoded with ASCII encoding +Returns NULL if argument is NULL. + Example: + > SELECT uri_escape('http://www.someurl.com/has spaces in url') FROM src LIMIT 1; +'http%3A%2F%2Fwww.someurl.com%2Fhas%20spaces%20in%20url.%21%25%23' +PREHOOK: query: EXPLAIN +SELECT uri_escape('http://google.com/resource?key=value1 & value2') +FROM src LIMIT 1 +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +SELECT uri_escape('http://google.com/resource?key=value1 & value2') +FROM src LIMIT 1 +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME src))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION uri_escape 'http://google.com/resource?key=value1 & value2'))) (TOK_LIMIT 1))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + src + TableScan + alias: src + Select Operator + expressions: + expr: uri_escape('http://google.com/resource?key=value1 & value2') + type: string + outputColumnNames: _col0 + Limit + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: 1 + 
+ +PREHOOK: query: SELECT uri_escape('http://google.com/resource?key=value1 & value2') +FROM src LIMIT 1 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: SELECT uri_escape('http://google.com/resource?key=value1 & value2') +FROM src LIMIT 1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +http%3A%2F%2Fgoogle.com%2Fresource%3Fkey%3Dvalue1%20%26%20value2 diff --git ql/src/test/results/clientpositive/udf_uri_unescape.q.out ql/src/test/results/clientpositive/udf_uri_unescape.q.out new file mode 100644 index 0000000..c4458e5 --- /dev/null +++ ql/src/test/results/clientpositive/udf_uri_unescape.q.out @@ -0,0 +1,65 @@ +PREHOOK: query: DESCRIBE FUNCTION uri_unescape +PREHOOK: type: DESCFUNCTION +POSTHOOK: query: DESCRIBE FUNCTION uri_unescape +POSTHOOK: type: DESCFUNCTION +uri_unescape(uri) - Returns the uri decoded +PREHOOK: query: DESCRIBE FUNCTION EXTENDED uri_unescape +PREHOOK: type: DESCFUNCTION +POSTHOOK: query: DESCRIBE FUNCTION EXTENDED uri_unescape +POSTHOOK: type: DESCFUNCTION +uri_unescape(uri) - Returns the uri decoded +Returns NULL if argument is NULL. 
+ Example: + > SELECT uri_unescape('http%3A%2F%2Fwww.someurl.com%2Fhas%20spaces%20in%20url.%21%25%23') FROM src LIMIT 1; +'http://www.someurl.com/has spaces in url' +PREHOOK: query: EXPLAIN +SELECT uri_unescape('http%3A%2F%2Fgoogle.com%2Fresource%3Fkey%3Dvalue1%20%26%20value2') +FROM src LIMIT 1 +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +SELECT uri_unescape('http%3A%2F%2Fgoogle.com%2Fresource%3Fkey%3Dvalue1%20%26%20value2') +FROM src LIMIT 1 +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME src))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION uri_unescape 'http%3A%2F%2Fgoogle.com%2Fresource%3Fkey%3Dvalue1%20%26%20value2'))) (TOK_LIMIT 1))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + src + TableScan + alias: src + Select Operator + expressions: + expr: uri_unescape('http%3A%2F%2Fgoogle.com%2Fresource%3Fkey%3Dvalue1%20%26%20value2') + type: string + outputColumnNames: _col0 + Limit + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: 1 + + +PREHOOK: query: SELECT uri_unescape('http%3A%2F%2Fgoogle.com%2Fresource%3Fkey%3Dvalue1%20%26%20value2') +FROM src LIMIT 1 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: SELECT uri_unescape('http%3A%2F%2Fgoogle.com%2Fresource%3Fkey%3Dvalue1%20%26%20value2') +FROM src LIMIT 1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +http://google.com/resource?key=value1 & value2