diff --git a/LICENSE b/LICENSE index 3e7dc6b98c..316afc629b 100644 --- a/LICENSE +++ b/LICENSE @@ -404,4 +404,34 @@ products or services of Licensee, or any third party. agrees to be bound by the terms and conditions of this License Agreement. +For google re2j (https://github.com/google/re2j/blob/master/LICENSE): + +Copyright (c) 2009 The Go Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java index 148cf7e3d6..67e66cc926 100644 --- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java +++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java @@ -3880,6 +3880,7 @@ private static void populateLlapDaemonVarsSet(Set llapDaemonVarsSetLocal "Time to wait to finish prewarming spark executors"), HIVESTAGEIDREARRANGE("hive.stageid.rearrange", "none", new StringSet("none", "idonly", "traverse", "execution"), ""), HIVEEXPLAINDEPENDENCYAPPENDTASKTYPES("hive.explain.dependency.append.tasktype", false, ""), + HIVEUSEGOOGLEREGEXENGINE("hive.use.googleregex.engine",false,"whether to use google regex engine or not, default regex engine is java.util.regex"), HIVECOUNTERGROUP("hive.counters.group.name", "HIVE", "The name of counter group for internal Hive variables (CREATED_FILE, FATAL_ERROR, etc.)"), diff --git a/pom.xml b/pom.xml index 330522dd45..8858edb883 100644 --- a/pom.xml +++ b/pom.xml @@ -219,6 +219,7 @@ 3.0.0 0.6.0 2.2.4 + 1.2 2.0.1 @@ -920,6 +921,11 @@ snappy-java ${snappy.version} + + com.google.re2j + re2j + ${re2j.version} + diff --git a/ql/pom.xml b/ql/pom.xml index 7c4d26f512..d2fe8f5652 100644 --- a/ql/pom.xml +++ b/ql/pom.xml @@ -756,6 +756,11 @@ ${guava.version} test + + com.google.re2j + re2j + ${re2j.version} + @@ -978,6 +983,7 @@ org.apache.orc:orc-shims org.apache.orc:orc-tools joda-time:joda-time + com.google.re2j:re2j diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFRegExp.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFRegExp.java index d309c37cc1..3bf3cfd3d9 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFRegExp.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFRegExp.java @@ -23,6 +23,9 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; +import org.apache.hadoop.hive.conf.HiveConf; +import 
org.apache.hadoop.hive.ql.exec.MapredContext;
+import org.apache.hadoop.hive.ql.session.SessionState;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.apache.hadoop.hive.ql.exec.Description;
@@ -54,11 +57,39 @@
   private final BooleanWritable output = new BooleanWritable();
   private transient boolean isRegexConst;
   private transient String regexConst;
   private transient Pattern patternConst;
   private transient boolean warned;
+  // RE2/J counterpart of patternConst: the constant regex compiled with com.google.re2j.
+  private transient com.google.re2j.Pattern patternConstR2j;
+  // Selects RE2/J (true) or java.util.regex (false) for compiling and matching.
+  // NOTE(review): unlike the sibling fields this one is not transient - confirm that is intended.
+  private boolean useGoogleRegexEngine;
+
+  /**
+   * Switches to the RE2/J engine when hive.use.googleregex.engine is true in the job
+   * configuration of the given context. A null context, or a property that is unset or
+   * false, leaves the current setting unchanged.
+   *
+   * @param context the task context supplying the job configuration; may be null
+   */
+  @Override
+  public void configure(MapredContext context) {
+    // getBoolVar falls back to the variable's default when the key is absent, so an
+    // unset property cannot cause a NullPointerException here.
+    if (context != null
+        && HiveConf.getBoolVar(context.getJobConf(), HiveConf.ConfVars.HIVEUSEGOOGLEREGEXENGINE)) {
+      this.useGoogleRegexEngine = true;
+    }
+  }
 
   @Override
   public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
+    // When a session is available, its configuration decides which regex engine is used.
+    SessionState ss = SessionState.get();
+    if (ss != null) {
+      this.useGoogleRegexEngine = ss.getConf().getBoolVar(HiveConf.ConfVars.HIVEUSEGOOGLEREGEXENGINE);
+    }
+
     checkArgsSize(arguments, 2, 2);
 
     checkArgPrimitive(arguments, 0);
@@ -73,7 +104,12 @@ public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumen
     if (arguments[1] instanceof ConstantObjectInspector) {
       regexConst = getConstantStringValue(arguments, 1);
       if (regexConst != null) {
-        patternConst = Pattern.compile(regexConst);
+        // Compile the constant pattern once, with whichever engine is selected.
+        if (useGoogleRegexEngine) {
+          patternConstR2j = com.google.re2j.Pattern.compile(regexConst);
+        } else {
+          patternConst = Pattern.compile(regexConst);
+        }
       }
       isRegexConst = true;
     }
@@ -109,16 +145,24 @@ public Object evaluate(DeferredObject[] arguments) throws HiveException {
       return output;
     }
 
+    if (useGoogleRegexEngine) {
+      // RE2/J path: reuse the pre-compiled constant pattern, otherwise compile per call.
+      com.google.re2j.Pattern re2jPattern =
+          isRegexConst ? patternConstR2j : com.google.re2j.Pattern.compile(regex);
+      output.set(re2jPattern.matcher(s).find(0));
+      return output;
+    }
+
     Pattern p;
     if (isRegexConst) {
       p = patternConst;
     } else {
       p = Pattern.compile(regex);
     }
 
     Matcher m = p.matcher(s);
     output.set(m.find(0));
     return output;
   }
 
   @Override