From 3ba587a9acf0683c0ececb26ca86ebf3ea17b33a Mon Sep 17 00:00:00 2001 From: Ashutosh Chauhan Date: Mon, 28 Mar 2016 18:36:13 -0700 Subject: [PATCH] HIVE-13373 : Use most specific type for numerical constants --- .../hadoop/hive/ql/parse/TypeCheckProcFactory.java | 77 ++++---------- ql/src/test/queries/clientpositive/type_widening.q | 6 ++ .../results/clientpositive/infer_const_type.q.out | 6 +- .../results/clientpositive/type_widening.q.out | 112 +++++++++++++++++++++ .../results/clientpositive/vectorization_0.q.out | 2 +- .../vectorization_short_regress.q.out | 2 +- 6 files changed, 140 insertions(+), 65 deletions(-) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/TypeCheckProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/TypeCheckProcFactory.java index 45dfd27..5b1659e 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/TypeCheckProcFactory.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/TypeCheckProcFactory.java @@ -62,7 +62,6 @@ import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBaseCompare; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPAnd; -import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqual; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPOr; import org.apache.hadoop.hive.serde.serdeConstants; import org.apache.hadoop.hive.serde2.objectinspector.ConstantObjectInspector; @@ -991,69 +990,27 @@ protected ExprNodeDesc getXpathOrFuncExprNodeDesc(ASTNode expr, int constIdx = children.get(0) instanceof ExprNodeConstantDesc ? 0 : 1; - Set inferTypes = new HashSet(Arrays.asList( - serdeConstants.TINYINT_TYPE_NAME.toLowerCase(), - serdeConstants.SMALLINT_TYPE_NAME.toLowerCase(), - serdeConstants.INT_TYPE_NAME.toLowerCase(), - serdeConstants.BIGINT_TYPE_NAME.toLowerCase(), - serdeConstants.FLOAT_TYPE_NAME.toLowerCase(), - serdeConstants.DOUBLE_TYPE_NAME.toLowerCase(), - serdeConstants.STRING_TYPE_NAME.toLowerCase() - )); - String constType = children.get(constIdx).getTypeString().toLowerCase(); String columnType = children.get(1 - constIdx).getTypeString().toLowerCase(); - if (inferTypes.contains(constType) && inferTypes.contains(columnType) - && !columnType.equalsIgnoreCase(constType)) { - Object originalValue = ((ExprNodeConstantDesc) children.get(constIdx)).getValue(); - String constValue = originalValue.toString(); - boolean triedDouble = false; - Number value = null; - try { - if (columnType.equalsIgnoreCase(serdeConstants.TINYINT_TYPE_NAME)) { - value = new Byte(constValue); - } else if (columnType.equalsIgnoreCase(serdeConstants.SMALLINT_TYPE_NAME)) { - value = new Short(constValue); - } else if (columnType.equalsIgnoreCase(serdeConstants.INT_TYPE_NAME)) { - value = new Integer(constValue); - } else if (columnType.equalsIgnoreCase(serdeConstants.BIGINT_TYPE_NAME)) { - value = new Long(constValue); - } else if (columnType.equalsIgnoreCase(serdeConstants.FLOAT_TYPE_NAME)) { - value = new Float(constValue); - } else if (columnType.equalsIgnoreCase(serdeConstants.DOUBLE_TYPE_NAME)) { - triedDouble = true; - value = new Double(constValue); - } else if (columnType.equalsIgnoreCase(serdeConstants.STRING_TYPE_NAME)) { - // Don't scramble the const type information if comparing to a string column, - // It's not useful to do so; as of now, there is also a hack in - // SemanticAnalyzer#genTablePlan that causes every column to look like a string - // a string down here, so number type information is always lost otherwise. - boolean isNumber = (originalValue instanceof Number); - triedDouble = !isNumber; - value = isNumber ? (Number)originalValue : new Double(constValue); - } - } catch (NumberFormatException nfe) { - // this exception suggests the precise type inference did not succeed - // we'll try again to convert it to double - // however, if we already tried this, or the column is NUMBER type and - // the operator is EQUAL, return false due to the type mismatch - if (triedDouble && - (genericUDF instanceof GenericUDFOPEqual - && !columnType.equals(serdeConstants.STRING_TYPE_NAME))) { - return new ExprNodeConstantDesc(false); - } - - try { - value = new Double(constValue); - } catch (NumberFormatException ex) { - return new ExprNodeConstantDesc(false); - } - } - - if (value != null) { - children.set(constIdx, new ExprNodeConstantDesc(value)); + // Try to narrow type of constant + Object constVal = ((ExprNodeConstantDesc) children.get(constIdx)).getValue(); + try { + if (serdeConstants.SMALLINT_TYPE_NAME.equals(columnType) && (constVal instanceof Number || constVal instanceof String)) { + children.set(constIdx, new ExprNodeConstantDesc(new Short(constVal.toString()))); + } else if ((serdeConstants.TINYINT_TYPE_NAME.toLowerCase().equals(columnType)) && (constVal instanceof Number || constVal instanceof String)) { + children.set(constIdx, new ExprNodeConstantDesc(new Byte(constVal.toString()))); + } else if ((serdeConstants.INT_TYPE_NAME.toLowerCase().equals(columnType)) && (constVal instanceof Number || constVal instanceof String)) { + children.set(constIdx, new ExprNodeConstantDesc(new Integer(constVal.toString()))); + } else if ((serdeConstants.BIGINT_TYPE_NAME.toLowerCase().equals(columnType)) && (constVal instanceof Number || constVal instanceof String)) { + children.set(constIdx, new ExprNodeConstantDesc(new Long(constVal.toString()))); + } else if ((serdeConstants.FLOAT_TYPE_NAME.toLowerCase().equals(columnType)) && (constVal instanceof Number || constVal instanceof String)) { + children.set(constIdx, new ExprNodeConstantDesc(new Float(constVal.toString()))); + } else if ((serdeConstants.DOUBLE_TYPE_NAME.toLowerCase().equals(columnType)) && (constVal instanceof Number || constVal instanceof String)) { + children.set(constIdx, new ExprNodeConstantDesc(new Double(constVal.toString()))); } + } catch (NumberFormatException nfe) { + LOG.trace("Failed to narrow type of constant", nfe); } // if column type is char and constant type is string, then convert the constant to char diff --git a/ql/src/test/queries/clientpositive/type_widening.q b/ql/src/test/queries/clientpositive/type_widening.q index b504cf9..ab4720c 100644 --- a/ql/src/test/queries/clientpositive/type_widening.q +++ b/ql/src/test/queries/clientpositive/type_widening.q @@ -6,3 +6,9 @@ SELECT COALESCE(0, 9223372036854775807) FROM src LIMIT 1; EXPLAIN SELECT * FROM (SELECT 0 AS numcol FROM src UNION ALL SELECT 9223372036854775807 AS numcol FROM src) a ORDER BY numcol; SELECT * FROM (SELECT 0 AS numcol FROM src UNION ALL SELECT 9223372036854775807 AS numcol FROM src) a ORDER BY numcol; +create table t1(a tinyint, b smallint); +explain select * from t1 where a > 2; +explain select * from t1 where b < 2; +explain select * from t1 where a < 200; +explain select * from t1 where b > 40000; +drop table t1; diff --git a/ql/src/test/results/clientpositive/infer_const_type.q.out b/ql/src/test/results/clientpositive/infer_const_type.q.out index bd42f05..c395659 100644 --- a/ql/src/test/results/clientpositive/infer_const_type.q.out +++ b/ql/src/test/results/clientpositive/infer_const_type.q.out @@ -102,7 +102,7 @@ POSTHOOK: type: QUERY POSTHOOK: Input: default@infertypes #### A masked pattern was here #### 127 32767 12345 -12345 906.0 -307.0 1234 -WARNING: Comparing a bigint and a double may result in a loss of precision. +WARNING: Comparing a bigint and a string may result in a loss of precision. PREHOOK: query: -- all should return false as all numbers exceeed the largest number -- which could be represented by the corresponding type -- and string_col = long_const should return false @@ -137,7 +137,7 @@ STAGE PLANS: alias: infertypes Statistics: Num rows: 1 Data size: 117 Basic stats: COMPLETE Column stats: NONE Filter Operator - predicate: ((UDFToDouble(ti) = 128.0) or (UDFToDouble(si) = 32768.0) or (UDFToDouble(i) = 2.147483648E9) or (UDFToDouble(bi) = 9.223372036854776E18)) (type: boolean) + predicate: ((UDFToDouble(ti) = 128.0) or (UDFToInteger(si) = 32768) or (UDFToDouble(i) = 2.147483648E9) or (UDFToDouble(bi) = 9.223372036854776E18) or (UDFToDouble(fl) = null) or (db = null)) (type: boolean) Statistics: Num rows: 1 Data size: 117 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: ti (type: tinyint), si (type: smallint), i (type: int), bi (type: bigint), fl (type: float), db (type: double), str (type: string) @@ -157,7 +157,7 @@ STAGE PLANS: Processor Tree: ListSink -WARNING: Comparing a bigint and a double may result in a loss of precision. +WARNING: Comparing a bigint and a string may result in a loss of precision. PREHOOK: query: SELECT * FROM infertypes WHERE ti = '128' OR si = 32768 OR diff --git a/ql/src/test/results/clientpositive/type_widening.q.out b/ql/src/test/results/clientpositive/type_widening.q.out index 84e53f8..cef994b 100644 --- a/ql/src/test/results/clientpositive/type_widening.q.out +++ b/ql/src/test/results/clientpositive/type_widening.q.out @@ -1098,3 +1098,115 @@ POSTHOOK: Input: default@src 9223372036854775807 9223372036854775807 9223372036854775807 +PREHOOK: query: create table t1(a tinyint, b smallint) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@t1 +POSTHOOK: query: create table t1(a tinyint, b smallint) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@t1 +PREHOOK: query: explain select * from t1 where a > 2 +PREHOOK: type: QUERY +POSTHOOK: query: explain select * from t1 where a > 2 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + TableScan + alias: t1 + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: (a > 2) (type: boolean) + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE + Select Operator + expressions: a (type: tinyint), b (type: smallint) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE + ListSink + +PREHOOK: query: explain select * from t1 where b < 2 +PREHOOK: type: QUERY +POSTHOOK: query: explain select * from t1 where b < 2 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + TableScan + alias: t1 + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: (b < 2) (type: boolean) + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE + Select Operator + expressions: a (type: tinyint), b (type: smallint) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE + ListSink + +PREHOOK: query: explain select * from t1 where a < 200 +PREHOOK: type: QUERY +POSTHOOK: query: explain select * from t1 where a < 200 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + TableScan + alias: t1 + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: (UDFToInteger(a) < 200) (type: boolean) + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE + Select Operator + expressions: a (type: tinyint), b (type: smallint) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE + ListSink + +PREHOOK: query: explain select * from t1 where b > 40000 +PREHOOK: type: QUERY +POSTHOOK: query: explain select * from t1 where b > 40000 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + TableScan + alias: t1 + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: (UDFToInteger(b) > 40000) (type: boolean) + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE + Select Operator + expressions: a (type: tinyint), b (type: smallint) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE + ListSink + +PREHOOK: query: drop table t1 +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@t1 +PREHOOK: Output: default@t1 +POSTHOOK: query: drop table t1 +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@t1 +POSTHOOK: Output: default@t1 diff --git a/ql/src/test/results/clientpositive/vectorization_0.q.out b/ql/src/test/results/clientpositive/vectorization_0.q.out index 53dffed..b12ae68 100644 --- a/ql/src/test/results/clientpositive/vectorization_0.q.out +++ b/ql/src/test/results/clientpositive/vectorization_0.q.out @@ -1010,7 +1010,7 @@ STAGE PLANS: alias: alltypesorc Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE Filter Operator - predicate: ((cstring2 like '%b%') or (79.553 <> UDFToDouble(cint)) or (UDFToDouble(cbigint) < cdouble) or ((UDFToShort(ctinyint) >= csmallint) and (cboolean2 = 1) and (3569.0 = UDFToDouble(ctinyint)))) (type: boolean) + predicate: ((cstring2 like '%b%') or (79.553 <> UDFToDouble(cint)) or (UDFToDouble(cbigint) < cdouble) or ((UDFToShort(ctinyint) >= csmallint) and (cboolean2 = 1) and (3569 = UDFToInteger(ctinyint)))) (type: boolean) Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: cbigint (type: bigint), cfloat (type: float), ctinyint (type: tinyint) diff --git a/ql/src/test/results/clientpositive/vectorization_short_regress.q.out b/ql/src/test/results/clientpositive/vectorization_short_regress.q.out index 91c10f0..7691dda 100644 --- a/ql/src/test/results/clientpositive/vectorization_short_regress.q.out +++ b/ql/src/test/results/clientpositive/vectorization_short_regress.q.out @@ -1162,7 +1162,7 @@ STAGE PLANS: alias: alltypesorc Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE Filter Operator - predicate: (((197.0 > UDFToDouble(ctinyint)) and (UDFToLong(cint) = cbigint)) or (cbigint = 359) or (cboolean1 < 0) or ((cstring1 like '%ss') and (cfloat <= UDFToFloat(ctinyint)))) (type: boolean) + predicate: (((197 > UDFToInteger(ctinyint)) and (UDFToLong(cint) = cbigint)) or (cbigint = 359) or (cboolean1 < 0) or ((cstring1 like '%ss') and (cfloat <= UDFToFloat(ctinyint)))) (type: boolean) Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: cint (type: int), cbigint (type: bigint), cstring1 (type: string), cboolean1 (type: boolean), cfloat (type: float), cdouble (type: double), ctimestamp2 (type: timestamp), csmallint (type: smallint), cstring2 (type: string), cboolean2 (type: boolean), (UDFToDouble(cint) / UDFToDouble(cbigint)) (type: double), (UDFToDouble(cbigint) % 79.553) (type: double), (- (UDFToDouble(cint) / UDFToDouble(cbigint))) (type: double), (10.175 % UDFToDouble(cfloat)) (type: double), (- cfloat) (type: float), (cfloat - (- cfloat)) (type: float), ((cfloat - (- cfloat)) % -6432.0) (type: float), (cdouble * UDFToDouble(csmallint)) (type: double), (- cdouble) (type: double), (- cbigint) (type: bigint), (UDFToDouble(cfloat) - (UDFToDouble(cint) / UDFToDouble(cbigint))) (type: double), (- csmallint) (type: smallint), (3569 % cbigint) (type: bigint), (359.0 - cdouble) (type: double), (- csmallint) (type: smallint) -- 1.7.12.4 (Apple Git-37)