Index: ql/src/test/results/clientpositive/show_functions.q.out =================================================================== --- ql/src/test/results/clientpositive/show_functions.q.out (revision 960987) +++ ql/src/test/results/clientpositive/show_functions.q.out (working copy) @@ -107,6 +107,7 @@ rpad rtrim second +sentences sign sin size Index: ql/src/test/results/clientpositive/udf_sentences.q.out =================================================================== --- ql/src/test/results/clientpositive/udf_sentences.q.out (revision 0) +++ ql/src/test/results/clientpositive/udf_sentences.q.out (revision 0) @@ -0,0 +1,27 @@ +PREHOOK: query: SELECT sentences("Hive est un excellent outil pour les requêtes de données, et peut-être plus polyvalent que la traduction automatique! la ponctuation multiples, des phrases mal formées ... confusion - et pourtant ce UDF fonctionne encore!", "fr") FROM src LIMIT 1 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: file:/var/folders/7i/7iCDbWRkGHOcgJgX0zscimPXXts/-Tmp-/mlahiri/hive_2010-07-06_13-32-01_693_421966477742827148/10000 +POSTHOOK: query: SELECT sentences("Hive est un excellent outil pour les requêtes de données, et peut-être plus polyvalent que la traduction automatique! la ponctuation multiples, des phrases mal formées ... 
confusion - et pourtant ce UDF fonctionne encore!", "fr") FROM src LIMIT 1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: file:/var/folders/7i/7iCDbWRkGHOcgJgX0zscimPXXts/-Tmp-/mlahiri/hive_2010-07-06_13-32-01_693_421966477742827148/10000 +[["Hive","est","un","excellent","outil","pour","les","requêtes","de","donnÃ","es","et","peut-être","plus","polyvalent","que","la","traduction","automatique"],["la","ponctuation","multiples","des","phrases","mal","formÃ","es","confusion","et","pourtant","ce","UDF","fonctionne","encore"]] +PREHOOK: query: SELECT sentences("Hive ist ein ausgezeichnetes Werkzeug für die Abfrage von Daten, und vielleicht vielseitiger als die maschinelle Übersetzung! Mehrfache, schlecht gebildeten Sätze ... verwirrende Interpunktion - und doch ist diese UDF funktioniert immer noch!", "gr") FROM src LIMIT 1 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: file:/var/folders/7i/7iCDbWRkGHOcgJgX0zscimPXXts/-Tmp-/mlahiri/hive_2010-07-06_13-32-05_438_6757668800062242171/10000 +POSTHOOK: query: SELECT sentences("Hive ist ein ausgezeichnetes Werkzeug für die Abfrage von Daten, und vielleicht vielseitiger als die maschinelle Übersetzung! Mehrfache, schlecht gebildeten Sätze ... verwirrende Interpunktion - und doch ist diese UDF funktioniert immer noch!", "gr") FROM src LIMIT 1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: file:/var/folders/7i/7iCDbWRkGHOcgJgX0zscimPXXts/-Tmp-/mlahiri/hive_2010-07-06_13-32-05_438_6757668800062242171/10000 +[["Hive","ist","ein","ausgezeichnetes","Werkzeug","für","die","Abfrage","von","Daten","und","vielleicht","vielseitiger","als","die","maschinelle","Ã","bersetzung"],["Mehrfache","schlecht","gebildeten","SÃ","tze","verwirrende","Interpunktion","und","doch","ist","diese","UDF","funktioniert","immer","noch"]] +PREHOOK: query: SELECT sentences("Hive is an excellent tool for data querying; and perhaps more versatile than machine translation!! 
Multiple, ill-formed sentences...confounding punctuation--and yet this UDF still works!!!!") FROM src LIMIT 1 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: file:/var/folders/7i/7iCDbWRkGHOcgJgX0zscimPXXts/-Tmp-/mlahiri/hive_2010-07-06_13-32-08_772_9110038054104130682/10000 +POSTHOOK: query: SELECT sentences("Hive is an excellent tool for data querying; and perhaps more versatile than machine translation!! Multiple, ill-formed sentences...confounding punctuation--and yet this UDF still works!!!!") FROM src LIMIT 1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: file:/var/folders/7i/7iCDbWRkGHOcgJgX0zscimPXXts/-Tmp-/mlahiri/hive_2010-07-06_13-32-08_772_9110038054104130682/10000 +[["Hive","is","an","excellent","tool","for","data","querying","and","perhaps","more","versatile","than","machine","translation"],["Multiple","ill-formed","sentences","confounding","punctuation","and","yet","this","UDF","still","works"]] Index: ql/src/test/queries/clientpositive/udf_sentences.q =================================================================== --- ql/src/test/queries/clientpositive/udf_sentences.q (revision 0) +++ ql/src/test/queries/clientpositive/udf_sentences.q (revision 0) @@ -0,0 +1,3 @@ +SELECT sentences("Hive est un excellent outil pour les requêtes de données, et peut-être plus polyvalent que la traduction automatique! la ponctuation multiples, des phrases mal formées ... confusion - et pourtant ce UDF fonctionne encore!", "fr") FROM src LIMIT 1; +SELECT sentences("Hive ist ein ausgezeichnetes Werkzeug für die Abfrage von Daten, und vielleicht vielseitiger als die maschinelle Übersetzung! Mehrfache, schlecht gebildeten Sätze ... verwirrende Interpunktion - und doch ist diese UDF funktioniert immer noch!", "gr") FROM src LIMIT 1; +SELECT sentences("Hive is an excellent tool for data querying\; and perhaps more versatile than machine translation!! 
Multiple, ill-formed sentences...confounding punctuation--and yet this UDF still works!!!!") FROM src LIMIT 1;
Index: ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java (revision 960987)
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java (working copy)
@@ -162,6 +162,7 @@
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFMap;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPNotNull;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPNull;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDFSentences;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFSize;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFSplit;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFStruct;
@@ -380,6 +381,7 @@
     registerGenericUDF("elt", GenericUDFElt.class);
     registerGenericUDF("concat_ws", GenericUDFConcatWS.class);
     registerGenericUDF("array_contains", GenericUDFArrayContains.class);
+    registerGenericUDF("sentences", GenericUDFSentences.class);
 
     // Generic UDTF's
     registerGenericUDTF("explode", GenericUDTFExplode.class);
Index: ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFSentences.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFSentences.java (revision 0)
+++ ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFSentences.java (revision 0)
@@ -0,0 +1,180 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.udf.generic;
+
+import java.util.ArrayList;
+import java.util.Locale;
+import java.text.BreakIterator;
+
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import org.apache.hadoop.io.Text;
+
+/**
+ * GenericUDFSentences: splits a natural language chunk of text into sentences and words.
+ *
+ * <p>Sentence and word boundaries come from {@link BreakIterator}. A token is kept as a word
+ * only when it starts with a letter or digit, which drops punctuation and whitespace tokens.
+ */
+@Description(name = "sentences", value = "_FUNC_(str, lang, country) - Splits str"
+  + " into arrays of sentences, where each sentence is an array of words. The 'lang' and "
+  + "'country' arguments are optional, and if omitted, the default locale is used.",
+  extended = "Example:\n"
+  + " > SELECT _FUNC_('Hello there! I am a UDF.') FROM src LIMIT 1;\n"
+  + " [ [\"Hello\", \"there\"], [\"I\", \"am\", \"a\", \"UDF\"] ]\n"
+  + " > SELECT _FUNC_(review, language) FROM movies;\n"
+  + "Unnecessary punctuation, such as periods and commas in English, is automatically stripped."
+  + " If specified, 'lang' should be a two-letter ISO-639 language code (such as 'en'), and "
+  + "'country' should be a two-letter ISO-3166 code (such as 'us'). Not all country and "
+  + "language codes are fully supported, and if an unsupported code is specified, a default "
+  + "locale is used to process that string.")
+public class GenericUDFSentences extends GenericUDF {
+  /** Converts each argument to {@link Text}; populated by initialize(). */
+  private ObjectInspectorConverters.Converter[] converters;
+
+  /**
+   * Checks the argument count and prepares a string converter for every argument.
+   *
+   * @param arguments object inspectors for the one to three call arguments
+   * @return an inspector describing a list of lists of writable strings
+   * @throws UDFArgumentException if fewer than 1 or more than 3 arguments are given
+   */
+  @Override
+  public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
+    if (arguments.length < 1 || arguments.length > 3) {
+      throw new UDFArgumentLengthException(
+          "The function sentences takes between 1 and 3 arguments.");
+    }
+
+    converters = new ObjectInspectorConverters.Converter[arguments.length];
+    for (int i = 0; i < arguments.length; i++) {
+      converters[i] = ObjectInspectorConverters.getConverter(arguments[i],
+          PrimitiveObjectInspectorFactory.writableStringObjectInspector);
+    }
+
+    return ObjectInspectorFactory.getStandardListObjectInspector(
+        ObjectInspectorFactory.getStandardListObjectInspector(
+            PrimitiveObjectInspectorFactory.writableStringObjectInspector));
+  }
+
+  /**
+   * Splits the first argument into sentences and each sentence into words.
+   *
+   * @param arguments the text, optionally followed by a language code and a country code
+   * @return a list of sentences, each a list of {@link Text} words, or null for null text
+   * @throws HiveException presumably when a deferred argument cannot be read
+   */
+  @Override
+  public Object evaluate(DeferredObject[] arguments) throws HiveException {
+    assert (arguments.length >= 1 && arguments.length <= 3);
+    // Read each deferred argument exactly once.
+    Object input = arguments[0].get();
+    if (input == null) {
+      return null;
+    }
+
+    Locale locale = resolveLocale(arguments);
+    String text = ((Text) converters[0].convert(input)).toString();
+    ArrayList<ArrayList<Text>> result = new ArrayList<ArrayList<Text>>();
+
+    // Both iterators depend only on the locale, so create them once per call instead of
+    // building a new word iterator for every sentence.
+    BreakIterator sentences = BreakIterator.getSentenceInstance(locale);
+    BreakIterator words = BreakIterator.getWordInstance(locale);
+
+    sentences.setText(text);
+    int start = sentences.first();
+    for (int end = sentences.next(); end != BreakIterator.DONE; end = sentences.next()) {
+      result.add(splitWords(words, text.substring(start, end)));
+      start = end;
+    }
+    return result;
+  }
+
+  /**
+   * Picks the locale used for tokenization.
+   *
+   * @param arguments the call arguments; index 1 is the language, index 2 the country
+   * @return the requested locale, or the JVM default when no language is given
+   * @throws HiveException presumably when a deferred argument cannot be read
+   */
+  private Locale resolveLocale(DeferredObject[] arguments) throws HiveException {
+    Object lang = arguments.length > 1 ? arguments[1].get() : null;
+    if (lang == null) {
+      // Without a language the country argument is ignored.
+      return Locale.getDefault();
+    }
+    // Case-fold with Locale.ROOT so the codes do not depend on the JVM's default locale
+    // (for example, the Turkish dotless i would turn "IT" into an unknown language).
+    String language = ((Text) converters[1].convert(lang)).toString().toLowerCase(Locale.ROOT);
+
+    Object ctry = arguments.length > 2 ? arguments[2].get() : null;
+    if (ctry == null) {
+      return new Locale(language);
+    }
+    String country = ((Text) converters[2].convert(ctry)).toString().toUpperCase(Locale.ROOT);
+    return new Locale(language, country);
+  }
+
+  /**
+   * Collects the words of one sentence.
+   *
+   * @param words a word-boundary iterator; its text is replaced by this call
+   * @param sentence the sentence to split
+   * @return the tokens that start with a letter or digit, in order of appearance
+   */
+  private static ArrayList<Text> splitWords(BreakIterator words, String sentence) {
+    ArrayList<Text> kept = new ArrayList<Text>();
+    words.setText(sentence);
+    int start = words.first();
+    for (int end = words.next(); end != BreakIterator.DONE; end = words.next()) {
+      // Test the whole code point so words starting outside the BMP are not dropped.
+      if (Character.isLetterOrDigit(sentence.codePointAt(start))) {
+        kept.add(new Text(sentence.substring(start, end)));
+      }
+      start = end;
+    }
+    return kept;
+  }
+
+  /**
+   * Renders the call for display.
+   *
+   * @param children display strings of the argument expressions
+   * @return "sentences(" followed by the comma-separated children and ")"
+   */
+  @Override
+  public String getDisplayString(String[] children) {
+    assert (children.length >= 1 && children.length <= 3);
+    StringBuilder display = new StringBuilder("sentences(");
+    for (int i = 0; i < children.length; i++) {
+      if (i > 0) {
+        display.append(", ");
+      }
+      display.append(children[i]);
+    }
+    return display.append(")").toString();
+  }
+}