Index: ql/src/test/results/clientpositive/show_functions.q.out =================================================================== --- ql/src/test/results/clientpositive/show_functions.q.out (revision 1345772) +++ ql/src/test/results/clientpositive/show_functions.q.out (working copy) @@ -147,6 +147,7 @@ tan to_date to_utc_timestamp +translate trim ucase unhex @@ -215,6 +216,7 @@ size space to_date +translate ucase variance xpath_double Index: ql/src/test/queries/clientnegative/udf_translate_wrong1.q =================================================================== --- ql/src/test/queries/clientnegative/udf_translate_wrong1.q (revision 0) +++ ql/src/test/queries/clientnegative/udf_translate_wrong1.q (revision 0) @@ -0,0 +1,2 @@ +-- Invalid number of arguments +SELECT translate('-') FROM src LIMIT 1; Index: ql/src/test/queries/clientnegative/udf_translate_wrong2.q =================================================================== --- ql/src/test/queries/clientnegative/udf_translate_wrong2.q (revision 0) +++ ql/src/test/queries/clientnegative/udf_translate_wrong2.q (revision 0) @@ -0,0 +1,2 @@ +-- Invalid type of argument(integer instead of string) +SELECT translate('abcd', 1, 2) FROM src LIMIT 1; Index: ql/src/test/queries/clientnegative/udf_translate_wrong3.q =================================================================== --- ql/src/test/queries/clientnegative/udf_translate_wrong3.q (revision 0) +++ ql/src/test/queries/clientnegative/udf_translate_wrong3.q (revision 0) @@ -0,0 +1,2 @@ +-- Invalid type of argument(array instead of string) +SELECT translate(array('first', 'second', 'third'), 'abcd', '1234') FROM src LIMIT 1; Index: ql/src/test/queries/clientpositive/udf_translate.q =================================================================== --- ql/src/test/queries/clientpositive/udf_translate.q (revision 0) +++ ql/src/test/queries/clientpositive/udf_translate.q (revision 0) @@ -0,0 +1,37 @@ +DESCRIBE FUNCTION translate; +DESCRIBE FUNCTION EXTENDED 
translate; + +-- Create some tables to serve some input data +CREATE TABLE table_input(input STRING); +CREATE TABLE table_translate(input_string STRING, from_string STRING, to_string STRING); + +FROM src INSERT OVERWRITE TABLE table_input SELECT 'abcd' WHERE src.key = 86; +FROM src INSERT OVERWRITE TABLE table_translate SELECT 'abcd', 'ahd', '12' WHERE src.key = 86; + +-- Run some queries on constant input parameters +SELECT translate('abcd', 'ab', '12'), + translate('abcd', 'abc', '12') FROM src LIMIT 1; + +-- Run some queries where first parameter being a table column while the other two being constants +SELECT translate(table_input.input, 'ab', '12'), + translate(table_input.input, 'abc', '12') FROM table_input LIMIT 1; + +-- Run some queries where all parameters are coming from table columns +SELECT translate(input_string, from_string, to_string) FROM table_translate LIMIT 1; + +-- Run some queries where some parameters are NULL +SELECT translate(NULL, 'ab', '12'), + translate('abcd', NULL, '12'), + translate('abcd', 'ab', NULL), + translate(NULL, NULL, NULL) FROM src LIMIT 1; + +-- Run some queries where the same character appears several times in the from string (2nd argument) of the UDF +SELECT translate('abcd', 'aba', '123'), + translate('abcd', 'aba', '12') FROM src LIMIT 1; + +-- Run some queries for the ignorant case when the 3rd parameter has more characters than the second one +SELECT translate('abcd', 'abc', '1234') FROM src LIMIT 1; + +-- Test proper function over UTF-8 characters +SELECT translate('Àbcd', 'À', 'Ã') FROM src LIMIT 1; + Index: ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java (revision 1345772) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java (working copy) @@ -197,6 +197,7 @@ import org.apache.hadoop.hive.ql.udf.generic.GenericUDFTimestamp; import 
org.apache.hadoop.hive.ql.udf.generic.GenericUDFToBinary; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFToUtcTimestamp; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFTranslate; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFUnion; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFWhen; import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF; @@ -305,6 +306,7 @@ registerUDF("parse_url", UDFParseUrl.class, false); registerGenericUDF("split", GenericUDFSplit.class); registerGenericUDF("str_to_map", GenericUDFStringToMap.class); + registerGenericUDF("translate", GenericUDFTranslate.class); registerUDF("positive", UDFOPPositive.class, true, "+"); registerUDF("negative", UDFOPNegative.class, true, "-"); Index: ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFTranslate.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFTranslate.java (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFTranslate.java (revision 0) @@ -0,0 +1,294 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.ql.udf.generic; + +import java.nio.ByteBuffer; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.UDFType; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.io.Text; + +/** + * TRANSLATE(string input, string from, string to) is an equivalent function to + * translate in PostGresSQL. See explain extended annotation below to read more + * about how this UDF works + * + * @author mgrover + * + */ +@UDFType(deterministic = true) +//@formatter:off +@Description( + name = "translate", + value = "_FUNC_(input, from, to) - translates the input string by" + + " replacing the characters present in the from string with the" + + " corresponding characters in the to string", + extended = "_FUNC_(string input, string from, string to) is an" + + " equivalent function to translate in PostGreSQL. It works" + + " on a character by character basis on the input string (first" + + " parameter). A character in the input is checked for" + + " presence in the from string (second parameter). 
If a" + + " match happens, the character from to string (third " + + "parameter) which appears at the same index as the character" + + " in from string is obtained. This character is emitted in" + + " the output string instead of the original character from" + + " the input string. If the to string is shorter than the" + + " from string, there may not be a character present at" + + " the same index in the to string. In such a case, nothing is" + + " emitted for the original character and it's deleted from" + + " the output string." + + "\n" + + "For example," + + "\n" + + "\n" + + "_FUNC_('abcdef', 'adc', '19') returns '1b9ef' replacing" + + " 'a' with '1', 'd' with '9' and removing 'c' from the input" + + " string" + + "\n" + + "\n" + + "_FUNC_('a b c d', ' ', '') return 'abcd'" + + " removing all spaces from the input string" + + "\n" + + "\n" + + "If the same character is present multiple times in the" + + " input string, the first occurence of the character is the" + + " one that's considered for matching. However, it is not recommended" + + " to have the same character more than once in the from" + + " string since it's not required and adds to confusion." + + "\n" + + "\n" + + "For example," + + "\n" + + "\n" + + "_FUNC_('abcdef', 'ada', '192') returns '1bc9ef' replaces" + + " 'a' with '1' and 'd' with '9' ignoring the second" + + " occurence of 'a' in the from string mapping it to '2'" +) +//@formatter:on +public class GenericUDFTranslate extends GenericUDF { + + // For all practical purposes a code point is a fancy name for character. A + // java char data type can store characters that require 16 bits or less. + // However, the unicode specification has changed to allow for characters + // whose representation requires more than 16 bits. 
Therefore we need to + // represent each character (called a code point from hereon) as int + // More details at + // http://docs.oracle.com/javase/7/docs/api/java/lang/Character.html + + // If a code point needs to be replaced with + // another code point, this map will store the mapping. + private Map<Integer, Integer> replacementMap = new HashMap<Integer, Integer>(); + + // This set stores all the code points which needed to be deleted from the + // input string. The objects in deletionSet and keys in replacementMap are + // mutually exclusive + private Set<Integer> deletionSet = new HashSet<Integer>(); + private Text result = new Text(); + + // The values of from and to parameters from the previous evaluate() call + private Text lastFrom = null; + private Text lastTo = null; + private ObjectInspectorConverters.Converter[] converters; + + @Override + public ObjectInspector initialize(ObjectInspector[] arguments) + throws UDFArgumentException { + if (arguments.length != 3) { + throw new UDFArgumentLengthException( + "_FUNC_ expects exactly 3 arguments"); + } + + for (int i = 0; i < arguments.length; i++) { + // Check the category before casting: a non-primitive inspector (array, map, ...) is not a + // PrimitiveObjectInspector, so it gets a null category here and is rejected by the check below + PrimitiveCategory primitiveCategory = (arguments[i].getCategory() != Category.PRIMITIVE) ? null + : ((PrimitiveObjectInspector) arguments[i]).getPrimitiveCategory(); + if ((primitiveCategory != PrimitiveCategory.STRING) && (primitiveCategory != PrimitiveCategory.VOID)) { + throw new UDFArgumentTypeException(i, "A string argument was expected but an argument of type " + + arguments[i].getTypeName() + " was given."); + } + } + + converters = new ObjectInspectorConverters.Converter[arguments.length]; + for (int i = 0; i < arguments.length; i++) { + converters[i] = ObjectInspectorConverters + .getConverter( + arguments[i], + PrimitiveObjectInspectorFactory.writableStringObjectInspector); + } + + // We will be returning a Text object + return PrimitiveObjectInspectorFactory.writableStringObjectInspector; + } + + @Override + public Object evaluate(DeferredObject[] arguments) throws HiveException { + assert
(arguments.length == 3); + if (arguments[0].get() == null || arguments[1].get() == null + || arguments[2].get() == null) { + return null; + } + + Text input = (Text) converters[0].convert(arguments[0].get()); + Text from = (Text) converters[1].convert(arguments[1].get()); + Text to = (Text) converters[2].convert(arguments[2].get()); + + populateMappingsIfNecessary(from, to); + String resultString = processInput(input); + result.set(resultString); + return result; + } + + /** + * Rebuilds the code point mappings by calling + * {@link #populateMappings(Text, Text)}, but only when from or to differ from the values seen on the previous call + * + * @param from + * from string to be used for translation + * @param to + * to string to be used for translation + */ + private void populateMappingsIfNecessary(Text from, Text to) { + // If the from and to strings haven't changed, we don't need to + // preprocess again to regenerate the mappings of code points that need + // to be replaced or deleted + if ((lastFrom == null) || (lastTo == null) || !from.equals(lastFrom) + || !to.equals(lastTo)) { + populateMappings(from, to); + // These are null when evaluate() is called for the first time + if (lastFrom == null) { + lastFrom = new Text(); + } + if (lastTo == null) { + lastTo = new Text(); + } + // Need to deep copy here since doing something like lastFrom = from + // instead, will make lastFrom point to the same Text object which + // would make from.equals(lastFrom) always true + lastFrom.set(from); + lastTo.set(to); + } + } + + /** + * Pre-processes the from and to strings to populate {@link #replacementMap} and + * {@link #deletionSet}, clearing whatever a previous call stored in them + * + * @param from + * from string to be used for translation + * @param to + * to string to be used for translation + */ + private void populateMappings(Text from, Text to) { + replacementMap.clear(); + deletionSet.clear(); + + ByteBuffer fromBytes = ByteBuffer.wrap(from.getBytes(), 0, + from.getLength()); + ByteBuffer toBytes = ByteBuffer.wrap(to.getBytes(), 0, to.getLength()); + + // Traverse through
the from string, one code point at a time + while (fromBytes.hasRemaining()) { + // This will also move the iterator ahead by one code point + int fromCodePoint = Text.bytesToCodePoint(fromBytes); + // If the to string has more code points, consume one now so that both + // strings advance together, even when the from code point is a duplicate + if (toBytes.hasRemaining()) { + int toCodePoint = Text.bytesToCodePoint(toBytes); + // If the code point from the from string already has a replacement + // or is to be deleted, the first occurrence wins: skip it and move + // on to the next code point + if (replacementMap.containsKey(fromCodePoint) + || deletionSet.contains(fromCodePoint)) { + continue; + } + replacementMap.put(fromCodePoint, toCodePoint); + } else { + // The to string is exhausted. If the code point from the from string + // already has a replacement or is to be deleted, skip it and move + // on to the next code point; otherwise mark it for deletion + if (replacementMap.containsKey(fromCodePoint) + || deletionSet.contains(fromCodePoint)) { + continue; + } + deletionSet.add(fromCodePoint); + } + } + } + + /** + * Translates the input string based on {@link #replacementMap} and + * {@link #deletionSet} and returns the translated string + * + * @param input + * input string to perform the translation on + * @return translated string + */ + private String processInput(Text input) { + StringBuilder resultBuilder = new StringBuilder(); + // Obtain the byte buffer from the input string so we can traverse it + // code point by code point + ByteBuffer inputBytes = ByteBuffer.wrap(input.getBytes(), 0, + input.getLength()); + // Traverse the byte buffer containing the input string one code point + // at a time + while (inputBytes.hasRemaining()) { + int inputCodePoint = Text.bytesToCodePoint(inputBytes); + // If the code point exists in deletion set, no need to emit out + // anything for this code point.
Continue on to the next code point + if (deletionSet.contains(inputCodePoint)) { + continue; + } + + Integer replacementCodePoint = replacementMap.get(inputCodePoint); + // If a replacement exists for this code point, append the + // replacement to the output string. If no such replacement exists + // (the lookup returned null), append the original input code point + char[] charArray = Character + .toChars((replacementCodePoint != null) ? replacementCodePoint + : inputCodePoint); + resultBuilder.append(charArray); + } + String resultString = resultBuilder.toString(); + return resultString; + } + + @Override + public String getDisplayString(String[] children) { + assert (children.length == 3); + return "translate(" + children[0] + ", " + children[1] + ", " + + children[2] + ")"; + } + +}