diff --git ql/src/java/org/apache/hadoop/hive/ql/io/sarg/ConvertAstToSearchArg.java ql/src/java/org/apache/hadoop/hive/ql/io/sarg/ConvertAstToSearchArg.java
index 7e888bc..c9260ea 100644
--- ql/src/java/org/apache/hadoop/hive/ql/io/sarg/ConvertAstToSearchArg.java
+++ ql/src/java/org/apache/hadoop/hive/ql/io/sarg/ConvertAstToSearchArg.java
@@ -27,13 +27,17 @@
 import org.slf4j.LoggerFactory;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hive.common.type.HiveChar;
+import org.apache.hadoop.hive.ql.exec.FunctionRegistry;
 import org.apache.hadoop.hive.ql.exec.Utilities;
 import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
 import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
 import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
 import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
 import org.apache.hadoop.hive.ql.plan.TableScanDesc;
+import org.apache.hadoop.hive.ql.udf.UDFLike;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBetween;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFIn;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPAnd;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqual;
@@ -48,7 +52,9 @@
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPNull;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPOr;
 import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
+import org.apache.hadoop.hive.serde2.objectinspector.ConstantObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils;
 import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
 import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
 
@@ -358,7 +364,22 @@ private void parse(ExprNodeDesc expression) {
 
     // get the kind of expression
     ExprNodeGenericFuncDesc expr = (ExprNodeGenericFuncDesc) expression;
-    Class<?> op = expr.getGenericUDF().getClass();
+    GenericUDF udf = expr.getGenericUDF();
+    Class<?> op = udf.getClass();
+    if (op == GenericUDFBridge.class) {
+      op = ((GenericUDFBridge) udf).getUdfClass();
+      if (op == UDFLike.class) {
+        // convert a LIKE pattern into a SARGable expression; when no prefix can be
+        // extracted, keep the original expression so it is reported as "maybe" below
+        ExprNodeGenericFuncDesc likeSarg = UDFLike.searchArgument(expr);
+        if (likeSarg != null) {
+          expr = likeSarg;
+          op = expr.getGenericUDF().getClass();
+          assert op == GenericUDFOPEqual.class || op == GenericUDFOPEqualOrGreaterThan.class
+              : "LIKE is always >= or =";
+        }
+      }
+    }
 
     // handle the logical operators
     if (op == GenericUDFOPOr.class) {
@@ -403,9 +424,8 @@ private void parse(ExprNodeDesc expression) {
       builder.startNot();
       createLeaf(PredicateLeaf.Operator.IS_NULL, expr, 0);
       builder.end();
-
-      // otherwise, we didn't understand it, so mark it maybe
     } else {
+      // otherwise, we didn't understand it, so mark it maybe
       builder.literal(SearchArgument.TruthValue.YES_NO_NULL);
     }
   }
diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeDescUtils.java ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeDescUtils.java
index e291a48..b9e6b7c 100644
--- ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeDescUtils.java
+++ ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeDescUtils.java
@@ -32,9 +32,13 @@
 import org.apache.hadoop.hive.ql.parse.SemanticException;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge;
+import org.apache.hadoop.hive.serde2.objectinspector.ConstantObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
 import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils.PrimitiveGrouping;
 import org.apache.hadoop.hive.serde2.typeinfo.HiveDecimalUtils;
 import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
 import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
@@ -598,4 +602,29 @@ public static ExprNodeDesc flattenExpr(ExprNodeDesc source) {
     // constant or null expr, just return
     return source;
   }
+
+  /**
+   * Convert an expr node desc into its constant String representation.
+   * Returns null if the expr is not a constant, if it is a non-string type,
+   * or if the constant value itself is NULL.
+   *
+   * @param expr expression to inspect
+   * @return constant string expression or NULL
+   */
+  public static String toConstantString(ExprNodeDesc expr) {
+    TypeInfo constType = expr.getTypeInfo();
+    if (constType.getCategory() == Category.PRIMITIVE
+        && PrimitiveObjectInspectorUtils.getPrimitiveGrouping(((PrimitiveTypeInfo) constType)
+            .getPrimitiveCategory()) == PrimitiveGrouping.STRING_GROUP) {
+      ObjectInspector oi = expr.getWritableObjectInspector();
+      if (oi instanceof ConstantObjectInspector) {
+        // only a constant inspector carries a value; anything else falls through to null
+        Object constValue = ((ConstantObjectInspector) oi).getWritableConstantValue();
+        if (constValue != null) {
+          return constValue.toString();
+        }
+      }
+    }
+    return null;
+  }
 }
diff --git ql/src/java/org/apache/hadoop/hive/ql/udf/UDFLike.java ql/src/java/org/apache/hadoop/hive/ql/udf/UDFLike.java
index 85d0363..8c008b7 100755
--- ql/src/java/org/apache/hadoop/hive/ql/udf/UDFLike.java
+++ ql/src/java/org/apache/hadoop/hive/ql/udf/UDFLike.java
@@ -18,13 +18,28 @@
 
 package org.apache.hadoop.hive.ql.udf;
 
+import java.util.Arrays;
+import java.util.List;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
+import javax.annotation.Nullable;
+
 import org.apache.hadoop.hive.ql.exec.Description;
 import org.apache.hadoop.hive.ql.exec.UDF;
 import org.apache.hadoop.hive.ql.exec.vector.VectorizedExpressions;
 import org.apache.hadoop.hive.ql.exec.vector.expressions.FilterStringColLikeStringScalar;
+import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDescUtils;
+import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualOrGreaterThan;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils.PrimitiveGrouping;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
 import org.apache.hadoop.io.BooleanWritable;
 import org.apache.hadoop.io.Text;
 
@@ -202,6 +217,9 @@ public BooleanWritable evaluate(Text s, Text likePattern) {
         case BEGIN:
           endS = simplePattern.getLength();
           break;
+        case MIDDLE:
+          // find() always looks in the middle mode.
+          break;
         case END:
           startS = endS - simplePattern.getLength();
           break;
@@ -217,4 +235,85 @@ public BooleanWritable evaluate(Text s, Text likePattern) {
     return result;
   }
 
+  /**
+   * Extracts the longest literal prefix that every string matching likePattern must start
+   * with, i.e. everything before the first unescaped wildcard. Escaped wildcards ("\\_" and
+   * "\\%") are treated as the literal characters '_' and '%'.
+   *
+   * "ab%" will be "ab"
+   * "a_b" will be "a"
+   * "abc%q%q_" will be "abc"
+   * "%abc" will be null
+   * null will be null
+   *
+   * @param likePattern the LIKE pattern, may be null
+   * @return the literal prefix, or null if the pattern is null or has no literal prefix
+   */
+  private static String minPrefixPattern(@Nullable String likePattern) {
+    if (likePattern == null) {
+      return null;
+    }
+    int length = likePattern.length();
+    StringBuilder sb = new StringBuilder();
+    for (int i = 0; i < length; i++) {
+      // Make a special case for "\\_" and "\\%"
+      char n = likePattern.charAt(i);
+      if (n == '\\'
+          && i + 1 < length
+          && (likePattern.charAt(i + 1) == '_' || likePattern.charAt(i + 1) == '%')) {
+        sb.append(likePattern.charAt(i + 1));
+        i++;
+        continue;
+      }
+      if (n == '_' || n == '%') {
+        // wildcard, end prefix
+        break;
+      } else {
+        sb.append(n);
+      }
+    }
+    if (sb.length() > 0) {
+      return sb.toString();
+    }
+    return null;
+  }
+
+  /**
+   * Derives a search argument (SARG) friendly predicate from a LIKE expression. For
+   * "col LIKE 'prefix...'" on a string-group column this is "col >= 'prefix'", which is a
+   * necessary (but not sufficient) condition for the LIKE to match.
+   *
+   * @param expr a bridged UDFLike function expression
+   * @return the derived predicate, or null if no literal prefix can be extracted
+   * @throws IllegalArgumentException if expr is not a UDFLike expression
+   */
+  public static ExprNodeGenericFuncDesc searchArgument(ExprNodeGenericFuncDesc expr) {
+    GenericUDF udf = expr.getGenericUDF();
+    boolean isBridge = (udf instanceof GenericUDFBridge);
+
+    if (!isBridge || ((GenericUDFBridge) udf).getUdfClass() != UDFLike.class) {
+      // is not a bridge UDF or the bridged UDF isn't UDFLike
+      throw new IllegalArgumentException("The argument is not of the right UDF: needs UDFLike");
+    }
+
+    List<ExprNodeDesc> args = expr.getChildren();
+    if (args.size() != 2) {
+      return null;
+    }
+    ExprNodeDesc column = args.get(0);
+    ExprNodeDesc pattern = args.get(1);
+    if (column instanceof ExprNodeColumnDesc && pattern instanceof ExprNodeConstantDesc) {
+      String minPrefix = minPrefixPattern(ExprNodeDescUtils.toConstantString(pattern));
+      PrimitiveGrouping colGroup = TypeInfoUtils.getPrimitiveGrouping(column.getTypeInfo());
+      if (colGroup == PrimitiveGrouping.STRING_GROUP && minPrefix != null) {
+        // >= "min-prefix"
+        ExprNodeConstantDesc prefixNode = new ExprNodeConstantDesc(pattern.getTypeInfo(), minPrefix);
+        return new ExprNodeGenericFuncDesc(TypeInfoFactory.booleanTypeInfo,
+            new GenericUDFOPEqualOrGreaterThan(),
+            Arrays.<ExprNodeDesc>asList(column, prefixNode));
+      }
+    }
+    return null;
+  }
+
 }
diff --git serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/TypeInfoUtils.java serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/TypeInfoUtils.java
index a4323d1..341620d 100644
--- serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/TypeInfoUtils.java
+++ serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/TypeInfoUtils.java
@@ -45,6 +45,7 @@
 import org.apache.hadoop.hive.serde2.objectinspector.UnionObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
 import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils.PrimitiveGrouping;
 import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils.PrimitiveTypeEntry;
 
 /**
@@ -807,4 +808,18 @@ public static int getCharacterLengthForType(PrimitiveTypeInfo typeInfo) {
         return 0;
     }
   }
+
+  /**
+   * Return the primitive grouping type of the input type info.
+   *
+   * @param ti type info to classify
+   * @return primitive grouping of the type, or null if the type is not primitive
+   */
+  public static PrimitiveGrouping getPrimitiveGrouping(TypeInfo ti) {
+    if (ti.getCategory() == Category.PRIMITIVE) {
+      return PrimitiveObjectInspectorUtils.getPrimitiveGrouping(((PrimitiveTypeInfo) ti)
+          .getPrimitiveCategory());
+    }
+    return null;
+  }
 }