Details
-
Bug
-
Status: Patch Available
-
Minor
-
Resolution: Unresolved
-
0.13.0
-
None
-
None
-
Hive 0.13 with Hadoop 2.4 on a 3 node cluster
Description
Using BETWEEN, a custom UDF, and vectorized query execution yields incorrect query results.
Example Query: SELECT column_1 FROM table_1 WHERE column_1 BETWEEN (UDF_1 - X) and UDF_1
The following test scenario will reproduce the problem:
TEST UDF (SIMPLE FUNCTION THAT TAKES NO ARGUMENTS AND RETURNS 10000):
package com.test;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import java.lang.String;
import java.lang.*;
public class tenThousand extends UDF {
private final LongWritable result = new LongWritable();
public LongWritable evaluate()
{ result.set(10000); return result; }}
TEST DATA (test.input):
1|CBCABC|12
2|DBCABC|13
3|EBCABC|14
40000|ABCABC|15
50000|BBCABC|16
60000|CBCABC|17
CREATING ORC TABLE:
0: jdbc:hive2://server:10002/db> create table testTabOrc (first bigint, second varchar(20), third int) partitioned by (range int) clustered by (first) sorted by (first) into 8 buckets stored as orc tblproperties ("orc.compress" = "SNAPPY", "orc.index" = "true");
CREATE LOADING TABLE:
0: jdbc:hive2://server:10002/db> create table loadingDir (first bigint, second varchar(20), third int) partitioned by (range int) row format delimited fields terminated by '|' stored as textfile;
COPY IN DATA:
[root@server]# hadoop fs -copyFromLocal /tmp/test.input /db/loading/.
ORC DATA:
[root@server]# beeline -u jdbc:hive2://server:10002/db -n root --hiveconf hive.exec.dynamic.partition.mode=nonstrict --hiveconf hive.enforce.sorting=true -e "insert into table testTabOrc partition(range) select * from loadingDir;"
LOAD TEST FUNCTION:
0: jdbc:hive2://server:10002/db> add jar /opt/hadoop/lib/testFunction.jar
0: jdbc:hive2://server:10002/db> create temporary function ten_thousand as 'com.test.tenThousand';
TURN OFF VECTORIZATION:
0: jdbc:hive2://server:10002/db> set hive.vectorized.execution.enabled=false;
QUERY (RESULTS AS EXPECTED):
0: jdbc:hive2://server:10002/db> select first from testTabOrc where first between ten_thousand()-10000 and ten_thousand()-9995;
--------
first |
--------
1 |
2 |
3 |
--------
3 rows selected (15.286 seconds)
TURN ON VECTORIZATION:
0: jdbc:hive2://server:10002/db> set hive.vectorized.execution.enabled=true;
QUERY AGAIN (WRONG RESULTS):
0: jdbc:hive2://server:10002/db> select first from testTabOrc where first between ten_thousand()-10000 and ten_thousand()-9995;
--------
first |
--------
--------
No rows selected (17.763 seconds)