Index: ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java (revision 1446935) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java (working copy) @@ -191,6 +191,7 @@ import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPNull; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPOr; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFPrintf; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFRefinedSoundex; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFReflect; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFSentences; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFSize; @@ -200,8 +201,8 @@ import org.apache.hadoop.hive.ql.udf.generic.GenericUDFStruct; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFTimestamp; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFToBinary; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFToDecimal; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFToUnixTimeStamp; -import org.apache.hadoop.hive.ql.udf.generic.GenericUDFToDecimal; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFToUtcTimestamp; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFTranslate; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFUnion; @@ -314,6 +315,7 @@ registerGenericUDF("split", GenericUDFSplit.class); registerGenericUDF("str_to_map", GenericUDFStringToMap.class); registerGenericUDF("translate", GenericUDFTranslate.class); + registerGenericUDF("soundex_ref", GenericUDFRefinedSoundex.class); registerUDF("positive", UDFOPPositive.class, true, "+"); registerUDF("negative", UDFOPNegative.class, true, "-"); @@ -1031,36 +1033,36 @@ } if (udfMethods.size() > 1) { - // if the only difference is numeric types, pick the method + // if the only difference is numeric types, pick the method // with the smallest overall numeric 
type. int lowestNumericType = Integer.MAX_VALUE; boolean multiple = true; Method candidate = null; List referenceArguments = null; - + for (Method m: udfMethods) { int maxNumericType = 0; - + List argumentsAccepted = TypeInfoUtils.getParameterTypeInfos(m, argumentsPassed.size()); - + if (referenceArguments == null) { - // keep the arguments for reference - we want all the non-numeric + // keep the arguments for reference - we want all the non-numeric // arguments to be the same referenceArguments = argumentsAccepted; } - + Iterator referenceIterator = referenceArguments.iterator(); - + for (TypeInfo accepted: argumentsAccepted) { TypeInfo reference = referenceIterator.next(); - + if (numericTypes.containsKey(accepted)) { // We're looking for the udf with the smallest maximum numeric type. int typeValue = numericTypes.get(accepted); maxNumericType = typeValue > maxNumericType ? typeValue : maxNumericType; } else if (!accepted.equals(reference)) { // There are non-numeric arguments that don't match from one UDF to - // another. We give up at this point. + // another. We give up at this point. throw new AmbiguousMethodException(udfClass, argumentsPassed, mlist); } } Index: ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFRefinedSoundex.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFRefinedSoundex.java (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFRefinedSoundex.java (revision 0) @@ -0,0 +1,172 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.udf.generic;
+
+import java.util.Locale;
+
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.serde.serdeConstants;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
+import org.apache.hadoop.io.Text;
+
+/**
+ * GenericUDF implementing the refined soundex phonetic algorithm.
+ *
+ * The result is the first character of the upper-cased input followed by one
+ * digit per input character (the first character included):
+ * <pre>
+ * B, P    => 1
+ * F, V    => 2
+ * C, K, S => 3
+ * G, J    => 4
+ * Q, X, Z => 5
+ * D, T    => 6
+ * L       => 7
+ * M, N    => 8
+ * R       => 9
+ * Any other character => 0
+ * </pre>
+ * A digit equal to the digit of the preceding character is dropped, so runs of
+ * repeated or same-coded characters contribute a single digit.
+ */
+@Description(name = "soundex_ref",
+    value = "_FUNC_(string) - "
+    + "returns the string value after applying refined soundex algorithm",
+    extended = "Example:\n"
+    + " > SELECT _FUNC_('Carren') FROM src LIMIT 1;\n"
+    + " 'C30908'")
+public class GenericUDFRefinedSoundex extends GenericUDF {
+
+  /** Refined soundex digits for 'A'..'Z', indexed by (letter - 'A'). */
+  private static final String LETTER_CODES = "01360240043788015936020505";
+
+  /** Digit emitted for every character outside 'A'..'Z'. */
+  private static final char OTHER_CODE = '0';
+
+  private ObjectInspector[] argumentOIs;
+
+  /** Result holder; the same instance is filled and returned by every evaluate() call. */
+  private final Text resultText = new Text();
+
+  /**
+   * Validates that exactly one argument of type string (or void) was supplied.
+   *
+   * @param arguments inspectors of the call arguments
+   * @return the writable string object inspector describing the result
+   * @throws UDFArgumentException if the argument count or type is wrong
+   */
+  @Override
+  public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
+    if (arguments.length != 1) {
+      throw new UDFArgumentLengthException(
+          "The function soundex_ref(string) accepts only one string argument");
+    }
+
+    // Only a primitive string is meaningful; void is accepted as well
+    // (presumably for untyped NULLs, for which evaluate() returns null).
+    String typeName = arguments[0].getTypeName();
+    boolean accepted = arguments[0].getCategory() == ObjectInspector.Category.PRIMITIVE
+        && (serdeConstants.STRING_TYPE_NAME.equals(typeName)
+            || serdeConstants.VOID_TYPE_NAME.equals(typeName));
+    if (!accepted) {
+      throw new UDFArgumentTypeException(0, "The argument of function soundex_ref must be \""
+          + serdeConstants.STRING_TYPE_NAME + "\", but \"" + typeName + "\" was found.");
+    }
+
+    argumentOIs = arguments;
+    return PrimitiveObjectInspectorFactory.writableStringObjectInspector;
+  }
+
+  /**
+   * Encodes the single string argument with the refined soundex algorithm.
+   *
+   * @param arguments the deferred call arguments; only the first one is read
+   * @return the shared Text holding the encoded value, or null for a null input
+   * @throws HiveException propagated from resolving the deferred argument
+   */
+  @Override
+  public Object evaluate(DeferredObject[] arguments) throws HiveException {
+    // Resolve the deferred value once; a null input yields a null result.
+    Object value = arguments[0].get();
+    if (value == null) {
+      return null;
+    }
+
+    String input = ((StringObjectInspector) argumentOIs[0]).getPrimitiveJavaObject(value);
+    if (input == null) {
+      return null;
+    }
+
+    resultText.set(encode(input));
+    return resultText;
+  }
+
+  /**
+   * Computes the refined soundex code of a string.
+   *
+   * @param input the text to encode, not null
+   * @return the first upper-cased character followed by the de-duplicated
+   *         digits, or the empty string for an empty input
+   */
+  private static String encode(String input) {
+    // Locale.ROOT keeps the mapping independent of the JVM default locale
+    // (e.g. a Turkish locale would upper-case 'i' to a dotted capital I).
+    String upper = input.toUpperCase(Locale.ROOT);
+    if (upper.isEmpty()) {
+      return upper;
+    }
+
+    StringBuilder encoded = new StringBuilder(upper.length() + 1);
+    encoded.append(upper.charAt(0));
+
+    // The previous digit is tracked separately from the output so that the
+    // literal first character is never mistaken for a digit (input "0...").
+    char previousCode = 0;
+    for (int i = 0; i < upper.length(); i++) {
+      char current = upper.charAt(i);
+      char code = (current >= 'A' && current <= 'Z')
+          ? LETTER_CODES.charAt(current - 'A')
+          : OTHER_CODE;
+      if (i == 0 || code != previousCode) {
+        encoded.append(code);
+      }
+      previousCode = code;
+    }
+
+    return encoded.toString();
+  }
+
+  /**
+   * Builds the display string of the call from its single child string.
+   *
+   * @param children display strings of the arguments; exactly one is expected
+   * @return {@code "soundex_ref(" + children[0] + ")"}
+   */
+  @Override
+  public String getDisplayString(String[] children) {
+    assert (children.length == 1);
+    return "soundex_ref(" + children[0] + ")";
+  }
+}