Index: ql/src/test/results/clientpositive/show_functions.q.out =================================================================== --- ql/src/test/results/clientpositive/show_functions.q.out (revision 1345772) +++ ql/src/test/results/clientpositive/show_functions.q.out (working copy) @@ -147,6 +147,7 @@ tan to_date to_utc_timestamp +translate trim ucase unhex @@ -215,6 +216,7 @@ size space to_date +translate ucase variance xpath_double Index: ql/src/test/results/clientpositive/udf_translate.q.out =================================================================== --- ql/src/test/results/clientpositive/udf_translate.q.out (revision 0) +++ ql/src/test/results/clientpositive/udf_translate.q.out (revision 0) @@ -0,0 +1,172 @@ +PREHOOK: query: DESCRIBE FUNCTION translate +PREHOOK: type: DESCFUNCTION +POSTHOOK: query: DESCRIBE FUNCTION translate +POSTHOOK: type: DESCFUNCTION +translate(input, from, to) - translates the input string by replacing the characters present in the from string with the corresponding characters in the to string +PREHOOK: query: DESCRIBE FUNCTION EXTENDED translate +PREHOOK: type: DESCFUNCTION +POSTHOOK: query: DESCRIBE FUNCTION EXTENDED translate +POSTHOOK: type: DESCFUNCTION +translate(input, from, to) - translates the input string by replacing the characters present in the from string with the corresponding characters in the to string +translate(string input, string from, string to) is an equivalent function to translate in PostGreSQL. It works on a character by character basis on the input string (first parameter). A character in the input is checked for presence in the from string (second parameter). If a match happens, the character from to string (third parameter) which appears at the same index as the character in from string is obtained. This character is emitted in the output string instead of the original character from the input string. 
If the to string is shorter than the from string, there may not be a character present at the same index in the to string. In such a case, nothing is emitted for the original character and it's deleted from the output string. +For example, + +translate('abcdef', 'adc', '19') returns '1b9ef' replacing 'a' with '1', 'd' with '9' and removing 'c' from the input string + +translate('a b c d', ' ', '') return 'abcd' removing all spaces from the input string + +If the same character is present multiple times in the input string, the first occurence of the character is the one that's considered for matching. However, it is not recommended to have the same character more than once in the from string since it's not required and adds to confusion. + +For example, + +translate('abcdef', 'ada', '192') returns '1bc9ef' replaces 'a' with '1' and 'd' with '9' ignoring the second occurence of 'a' in the from string mapping it to '2' +PREHOOK: query: -- Create some tables to serve some input data +CREATE TABLE table_input(input STRING) +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- Create some tables to serve some input data +CREATE TABLE table_input(input STRING) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@table_input +PREHOOK: query: CREATE TABLE table_translate(input_string STRING, from_string STRING, to_string STRING) +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE table_translate(input_string STRING, from_string STRING, to_string STRING) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@table_translate +PREHOOK: query: FROM src INSERT OVERWRITE TABLE table_input SELECT 'abcd' WHERE src.key = 86 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@table_input +POSTHOOK: query: FROM src INSERT OVERWRITE TABLE table_input SELECT 'abcd' WHERE src.key = 86 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@table_input +POSTHOOK: Lineage: table_input.input SIMPLE [] +PREHOOK: query: FROM src INSERT 
OVERWRITE TABLE table_translate SELECT 'abcd', 'ahd', '12' WHERE src.key = 86 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@table_translate +POSTHOOK: query: FROM src INSERT OVERWRITE TABLE table_translate SELECT 'abcd', 'ahd', '12' WHERE src.key = 86 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@table_translate +POSTHOOK: Lineage: table_input.input SIMPLE [] +POSTHOOK: Lineage: table_translate.from_string SIMPLE [] +POSTHOOK: Lineage: table_translate.input_string SIMPLE [] +POSTHOOK: Lineage: table_translate.to_string SIMPLE [] +PREHOOK: query: -- Run some queries on constant input parameters +SELECT translate('abcd', 'ab', '12'), + translate('abcd', 'abc', '12') FROM src LIMIT 1 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: -- Run some queries on constant input parameters +SELECT translate('abcd', 'ab', '12'), + translate('abcd', 'abc', '12') FROM src LIMIT 1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: Lineage: table_input.input SIMPLE [] +POSTHOOK: Lineage: table_translate.from_string SIMPLE [] +POSTHOOK: Lineage: table_translate.input_string SIMPLE [] +POSTHOOK: Lineage: table_translate.to_string SIMPLE [] +12cd 12d +PREHOOK: query: -- Run some queries where first parameter being a table column while the other two being constants +SELECT translate(table_input.input, 'ab', '12'), + translate(table_input.input, 'abc', '12') FROM table_input LIMIT 1 +PREHOOK: type: QUERY +PREHOOK: Input: default@table_input +#### A masked pattern was here #### +POSTHOOK: query: -- Run some queries where first parameter being a table column while the other two being constants +SELECT translate(table_input.input, 'ab', '12'), + translate(table_input.input, 'abc', '12') FROM table_input LIMIT 1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@table_input +#### A masked pattern was here #### +POSTHOOK: 
Lineage: table_input.input SIMPLE [] +POSTHOOK: Lineage: table_translate.from_string SIMPLE [] +POSTHOOK: Lineage: table_translate.input_string SIMPLE [] +POSTHOOK: Lineage: table_translate.to_string SIMPLE [] +12cd 12d +PREHOOK: query: -- Run some queries where all parameters are coming from table columns +SELECT translate(input_string, from_string, to_string) FROM table_translate LIMIT 1 +PREHOOK: type: QUERY +PREHOOK: Input: default@table_translate +#### A masked pattern was here #### +POSTHOOK: query: -- Run some queries where all parameters are coming from table columns +SELECT translate(input_string, from_string, to_string) FROM table_translate LIMIT 1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@table_translate +#### A masked pattern was here #### +POSTHOOK: Lineage: table_input.input SIMPLE [] +POSTHOOK: Lineage: table_translate.from_string SIMPLE [] +POSTHOOK: Lineage: table_translate.input_string SIMPLE [] +POSTHOOK: Lineage: table_translate.to_string SIMPLE [] +1bc +PREHOOK: query: -- Run some queries where some parameters are NULL +SELECT translate(NULL, 'ab', '12'), + translate('abcd', NULL, '12'), + translate('abcd', 'ab', NULL), + translate(NULL, NULL, NULL) FROM src LIMIT 1 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: -- Run some queries where some parameters are NULL +SELECT translate(NULL, 'ab', '12'), + translate('abcd', NULL, '12'), + translate('abcd', 'ab', NULL), + translate(NULL, NULL, NULL) FROM src LIMIT 1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: Lineage: table_input.input SIMPLE [] +POSTHOOK: Lineage: table_translate.from_string SIMPLE [] +POSTHOOK: Lineage: table_translate.input_string SIMPLE [] +POSTHOOK: Lineage: table_translate.to_string SIMPLE [] +NULL NULL NULL NULL +PREHOOK: query: -- Run some queries where the same character appears several times in the from string (2nd argument) of the UDF +SELECT 
translate('abcd', 'aba', '123'), + translate('abcd', 'aba', '12') FROM src LIMIT 1 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: -- Run some queries where the same character appears several times in the from string (2nd argument) of the UDF +SELECT translate('abcd', 'aba', '123'), + translate('abcd', 'aba', '12') FROM src LIMIT 1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: Lineage: table_input.input SIMPLE [] +POSTHOOK: Lineage: table_translate.from_string SIMPLE [] +POSTHOOK: Lineage: table_translate.input_string SIMPLE [] +POSTHOOK: Lineage: table_translate.to_string SIMPLE [] +12cd 12cd +PREHOOK: query: -- Run some queries for the ignorant case when the 3rd parameter has more characters than the second one +SELECT translate('abcd', 'abc', '1234') FROM src LIMIT 1 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: -- Run some queries for the ignorant case when the 3rd parameter has more characters than the second one +SELECT translate('abcd', 'abc', '1234') FROM src LIMIT 1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: Lineage: table_input.input SIMPLE [] +POSTHOOK: Lineage: table_translate.from_string SIMPLE [] +POSTHOOK: Lineage: table_translate.input_string SIMPLE [] +POSTHOOK: Lineage: table_translate.to_string SIMPLE [] +123d +PREHOOK: query: -- Test proper function over UTF-8 characters +SELECT translate('Àbcd', 'À', 'Ã') FROM src LIMIT 1 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: -- Test proper function over UTF-8 characters +SELECT translate('Àbcd', 'À', 'Ã') FROM src LIMIT 1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: Lineage: table_input.input SIMPLE [] +POSTHOOK: Lineage: table_translate.from_string SIMPLE [] 
+POSTHOOK: Lineage: table_translate.input_string SIMPLE [] +POSTHOOK: Lineage: table_translate.to_string SIMPLE [] +Ãbcd Index: ql/src/test/queries/clientnegative/udf_translate_wrong1.q =================================================================== --- ql/src/test/queries/clientnegative/udf_translate_wrong1.q (revision 0) +++ ql/src/test/queries/clientnegative/udf_translate_wrong1.q (revision 0) @@ -0,0 +1,2 @@ +-- Invalid number of arguments +SELECT translate('-') FROM src LIMIT 1; Index: ql/src/test/queries/clientnegative/udf_translate_wrong2.q =================================================================== --- ql/src/test/queries/clientnegative/udf_translate_wrong2.q (revision 0) +++ ql/src/test/queries/clientnegative/udf_translate_wrong2.q (revision 0) @@ -0,0 +1,2 @@ +-- Invalid type of argument(integer instead of string) +SELECT translate('abcd', 1, 2) FROM src LIMIT 1; Index: ql/src/test/queries/clientnegative/udf_translate_wrong3.q =================================================================== --- ql/src/test/queries/clientnegative/udf_translate_wrong3.q (revision 0) +++ ql/src/test/queries/clientnegative/udf_translate_wrong3.q (revision 0) @@ -0,0 +1,2 @@ +-- Invalid type of argument(array instead of string) +SELECT translate(array('first', 'second', 'third'), 'abcd', '1234') FROM src LIMIT 1; Index: ql/src/test/queries/clientpositive/udf_translate.q =================================================================== --- ql/src/test/queries/clientpositive/udf_translate.q (revision 0) +++ ql/src/test/queries/clientpositive/udf_translate.q (revision 0) @@ -0,0 +1,37 @@ +DESCRIBE FUNCTION translate; +DESCRIBE FUNCTION EXTENDED translate; + +-- Create some tables to serve some input data +CREATE TABLE table_input(input STRING); +CREATE TABLE table_translate(input_string STRING, from_string STRING, to_string STRING); + +FROM src INSERT OVERWRITE TABLE table_input SELECT 'abcd' WHERE src.key = 86; +FROM src INSERT OVERWRITE TABLE table_translate 
SELECT 'abcd', 'ahd', '12' WHERE src.key = 86; + +-- Run some queries on constant input parameters +SELECT translate('abcd', 'ab', '12'), + translate('abcd', 'abc', '12') FROM src LIMIT 1; + +-- Run some queries where first parameter being a table column while the other two being constants +SELECT translate(table_input.input, 'ab', '12'), + translate(table_input.input, 'abc', '12') FROM table_input LIMIT 1; + +-- Run some queries where all parameters are coming from table columns +SELECT translate(input_string, from_string, to_string) FROM table_translate LIMIT 1; + +-- Run some queries where some parameters are NULL +SELECT translate(NULL, 'ab', '12'), + translate('abcd', NULL, '12'), + translate('abcd', 'ab', NULL), + translate(NULL, NULL, NULL) FROM src LIMIT 1; + +-- Run some queries where the same character appears several times in the from string (2nd argument) of the UDF +SELECT translate('abcd', 'aba', '123'), + translate('abcd', 'aba', '12') FROM src LIMIT 1; + +-- Run some queries for the ignorant case when the 3rd parameter has more characters than the second one +SELECT translate('abcd', 'abc', '1234') FROM src LIMIT 1; + +-- Test proper function over UTF-8 characters +SELECT translate('Àbcd', 'À', 'Ã') FROM src LIMIT 1; + Index: ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java (revision 1345772) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java (working copy) @@ -197,6 +197,7 @@ import org.apache.hadoop.hive.ql.udf.generic.GenericUDFTimestamp; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFToBinary; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFToUtcTimestamp; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFTranslate; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFUnion; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFWhen; import 
org.apache.hadoop.hive.ql.udf.generic.GenericUDTF; @@ -305,6 +306,7 @@ registerUDF("parse_url", UDFParseUrl.class, false); registerGenericUDF("split", GenericUDFSplit.class); registerGenericUDF("str_to_map", GenericUDFStringToMap.class); + registerGenericUDF("translate", GenericUDFTranslate.class); registerUDF("positive", UDFOPPositive.class, true, "+"); registerUDF("negative", UDFOPNegative.class, true, "-"); Index: ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFTranslate.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFTranslate.java (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFTranslate.java (revision 0) @@ -0,0 +1,291 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package org.apache.hadoop.hive.ql.udf.generic;
+
+import java.nio.ByteBuffer;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.udf.UDFType;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters;
+import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import org.apache.hadoop.io.Text;
+
+/**
+ * TRANSLATE(string input, string from, string to) is an equivalent function to translate in
+ * PostgreSQL. See the extended description below to read more about how this UDF works
+ *
+ */
+@UDFType(deterministic = true)
+//@formatter:off
+@Description(
+    name = "translate",
+    value = "_FUNC_(input, from, to) - translates the input string by" +
+        " replacing the characters present in the from string with the" +
+        " corresponding characters in the to string",
+    extended = "_FUNC_(string input, string from, string to) is an" +
+        " equivalent function to translate in PostGreSQL. It works" +
+        " on a character by character basis on the input string (first" +
+        " parameter). A character in the input is checked for" +
+        " presence in the from string (second parameter). If a" +
+        " match happens, the character from to string (third " +
+        "parameter) which appears at the same index as the character" +
+        " in from string is obtained. This character is emitted in" +
+        " the output string instead of the original character from" +
+        " the input string. If the to string is shorter than the" +
+        " from string, there may not be a character present at" +
+        " the same index in the to string. In such a case, nothing is" +
+        " emitted for the original character and it's deleted from" +
+        " the output string." +
+        "\n" +
+        "For example," +
+        "\n" +
+        "\n" +
+        "_FUNC_('abcdef', 'adc', '19') returns '1b9ef' replacing" +
+        " 'a' with '1', 'd' with '9' and removing 'c' from the input" +
+        " string" +
+        "\n" +
+        "\n" +
+        "_FUNC_('a b c d', ' ', '') return 'abcd'" +
+        " removing all spaces from the input string" +
+        "\n" +
+        "\n" +
+        "If the same character is present multiple times in the" +
+        " input string, the first occurence of the character is the" +
+        " one that's considered for matching. However, it is not recommended" +
+        " to have the same character more than once in the from" +
+        " string since it's not required and adds to confusion." +
+        "\n" +
+        "\n" +
+        "For example," +
+        "\n" +
+        "\n" +
+        "_FUNC_('abcdef', 'ada', '192') returns '1bc9ef' replaces" +
+        " 'a' with '1' and 'd' with '9' ignoring the second" +
+        " occurence of 'a' in the from string mapping it to '2'"
+)
+//@formatter:on
+public class GenericUDFTranslate extends GenericUDF {
+
+  // For all practical purposes a code point is a fancy name for character. A java char data type
+  // can store characters that require 16 bits or less. However, the unicode specification has
+  // changed to allow for characters whose representation requires more than 16 bits. Therefore we
+  // need to represent each character (called a code point from hereon) as int. More details at
+  // http://docs.oracle.com/javase/7/docs/api/java/lang/Character.html
+
+  /**
+   * If a code point needs to be replaced with another code point, this map will store the mapping.
+   */
+  private Map<Integer, Integer> replacementMap = new HashMap<Integer, Integer>();
+
+  /**
+   * This set stores all the code points which need to be deleted from the input string. The
+   * objects in deletionSet and keys in replacementMap are mutually exclusive
+   */
+  private Set<Integer> deletionSet = new HashSet<Integer>();
+  /**
+   * A placeholder for result.
+   */
+  private Text result = new Text();
+
+  /**
+   * The values of from parameter from the previous evaluate() call.
+   */
+  private Text lastFrom = null;
+  /**
+   * The values of to parameter from the previous evaluate() call.
+   */
+  private Text lastTo = null;
+  /**
+   * Converters for retrieving the arguments to the UDF.
+   */
+  private ObjectInspectorConverters.Converter[] converters;
+
+  @Override
+  public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
+    if (arguments.length != 3) {
+      throw new UDFArgumentLengthException("_FUNC_ expects exactly 3 arguments");
+    }
+
+    for (int i = 0; i < arguments.length; i++) {
+      if (arguments[i].getCategory() != Category.PRIMITIVE) {
+        throw new UDFArgumentTypeException(i,
+          "A string argument was expected but an argument of type " + arguments[i].getTypeName()
+              + " was given.");
+
+      }
+
+      // Now that we have made sure that the argument is of primitive type, we can get the primitive
+      // category
+      PrimitiveCategory primitiveCategory = ((PrimitiveObjectInspector) arguments[i])
+          .getPrimitiveCategory();
+
+      if (primitiveCategory != PrimitiveCategory.STRING
+          && primitiveCategory != PrimitiveCategory.VOID) {
+        throw new UDFArgumentTypeException(i,
+          "A string argument was expected but an argument of type " + arguments[i].getTypeName()
+              + " was given.");
+
+      }
+    }
+
+    converters = new ObjectInspectorConverters.Converter[arguments.length];
+    for (int i = 0; i < arguments.length; i++) {
+      converters[i] = ObjectInspectorConverters.getConverter(arguments[i],
+          PrimitiveObjectInspectorFactory.writableStringObjectInspector);
+    }
+
+    // We will be returning a Text object
+    return PrimitiveObjectInspectorFactory.writableStringObjectInspector;
+  }
+
+  @Override
+  public Object evaluate(DeferredObject[] arguments) throws HiveException {
+    assert (arguments.length == 3);
+    if (arguments[0].get() == null || arguments[1].get() == null || arguments[2].get() == null) {
+      return null;
+    }
+
+    Text input = (Text) converters[0].convert(arguments[0].get());
+    Text from = (Text) converters[1].convert(arguments[1].get());
+    Text to = (Text) converters[2].convert(arguments[2].get());
+
+    populateMappingsIfNecessary(from, to);
+    String resultString = processInput(input);
+    result.set(resultString);
+    return result;
+  }
+
+  /**
+   * Pre-processes the from and to strings by calling {@link #populateMappings(Text, Text)} if
+   * necessary.
+   *
+   * @param from
+   *          from string to be used for translation
+   * @param to
+   *          to string to be used for translation
+   */
+  private void populateMappingsIfNecessary(Text from, Text to) {
+    // If the from and to strings haven't changed, we don't need to preprocess again to regenerate
+    // the mappings of code points that need to be replaced or deleted
+    if ((lastFrom == null) || (lastTo == null) || !from.equals(lastFrom) || !to.equals(lastTo)) {
+      populateMappings(from, to);
+      // These are null when evaluate() is called for the first time
+      if (lastFrom == null) {
+        lastFrom = new Text();
+      }
+      if (lastTo == null) {
+        lastTo = new Text();
+      }
+      // Need to deep copy here since doing something like lastFrom = from instead, will make
+      // lastFrom point to the same Text object which would make from.equals(lastFrom) always true
+      lastFrom.set(from);
+      lastTo.set(to);
+    }
+  }
+
+  /**
+   * Populates {@link #replacementMap} and {@link #deletionSet} from the from and to strings.
+   *
+   * @param from
+   *          from string to be used for translation
+   * @param to
+   *          to string to be used for translation
+   */
+  private void populateMappings(Text from, Text to) {
+    replacementMap.clear();
+    deletionSet.clear();
+
+    ByteBuffer fromBytes = ByteBuffer.wrap(from.getBytes(), 0, from.getLength());
+    ByteBuffer toBytes = ByteBuffer.wrap(to.getBytes(), 0, to.getLength());
+
+    // Traverse through the from string, one code point at a time
+    while (fromBytes.hasRemaining()) {
+      // This will also move the iterator ahead by one code point
+      int fromCodePoint = Text.bytesToCodePoint(fromBytes);
+      // If the to string has more code points, make sure to traverse it too
+      if (toBytes.hasRemaining()) {
+        int toCodePoint = Text.bytesToCodePoint(toBytes);
+        // If the code point from from string already has a replacement or is to be deleted, we
+        // don't need to do anything, just move on to the next code point
+        if (replacementMap.containsKey(fromCodePoint) || deletionSet.contains(fromCodePoint)) {
+          continue;
+        }
+        replacementMap.put(fromCodePoint, toCodePoint);
+      } else {
+        // If the code point from from string already has a replacement or is to be deleted, we
+        // don't need to do anything, just move on to the next code point
+        if (replacementMap.containsKey(fromCodePoint) || deletionSet.contains(fromCodePoint)) {
+          continue;
+        }
+        deletionSet.add(fromCodePoint);
+      }
+    }
+  }
+
+  /**
+   * Translates the input string based on {@link #replacementMap} and {@link #deletionSet} and
+   * returns the translated string.
+   *
+   * @param input
+   *          input string to perform the translation on
+   * @return translated string
+   */
+  private String processInput(Text input) {
+    StringBuilder resultBuilder = new StringBuilder();
+    // Obtain the byte buffer from the input string so we can traverse it code point by code point
+    ByteBuffer inputBytes = ByteBuffer.wrap(input.getBytes(), 0, input.getLength());
+    // Traverse the byte buffer containing the input string one code point at a time
+    while (inputBytes.hasRemaining()) {
+      int inputCodePoint = Text.bytesToCodePoint(inputBytes);
+      // If the code point exists in deletion set, no need to emit out anything for this code point.
+      // Continue on to the next code point
+      if (deletionSet.contains(inputCodePoint)) {
+        continue;
+      }
+
+      Integer replacementCodePoint = replacementMap.get(inputCodePoint);
+      // If a replacement exists for this code point, emit out the replacement and append it to the
+      // output string. If no such replacement exists, emit out the original input code point
+      char[] charArray = Character.toChars((replacementCodePoint != null) ? replacementCodePoint
+          : inputCodePoint);
+      resultBuilder.append(charArray);
+    }
+    String resultString = resultBuilder.toString();
+    return resultString;
+  }
+
+  @Override
+  public String getDisplayString(String[] children) {
+    assert (children.length == 3);
+    return "translate(" + children[0] + ", " + children[1] + ", " + children[2] + ")";
+  }
+
+}