Description
With the Tez engine and Hive tables stored as ORC, built-in aggregate functions may produce incorrect values when all values being aggregated in a group are NULL as the result of an expression.
-- Test 1 -- The S column is populated as NaN SET hive.execution.engine=tez; CREATE TABLE LM ( D STRING, X DOUBLE ) STORED AS ORC; INSERT INTO TABLE LM VALUES ('2016-05-11',NULL), ('2016-05-11',NULL), ('2016-05-11',NULL), ('2016-05-12',NULL), ('2016-05-12',NULL), ('2016-05-12',NULL); SELECT D, MIN(X + 3) AS S FROM LM GROUP BY D; -- Test 2 -- The S column will be populated as 1 (dangerous case!) SET hive.execution.engine=tez; CREATE TABLE LM ( D STRING, X INT ) STORED AS ORC; INSERT INTO TABLE LM VALUES ('2016-05-11',NULL), ('2016-05-11',NULL), ('2016-05-11',NULL), ('2016-05-12',NULL), ('2016-05-12',NULL), ('2016-05-12',NULL); SELECT D, MIN(X + 3) AS S FROM LM GROUP BY D; -- Workaound: -- According to my tests, a workaround is to surround the nullable expression with -- COALESCE. Example: CREATE TABLE LM ( D STRING, X INT ) STORED AS ORC; INSERT INTO TABLE LM VALUES ('2016-05-11',NULL), ('2016-05-11',NULL), ('2016-05-11',NULL), ('2016-05-12',NULL), ('2016-05-12',NULL), ('2016-05-12',NULL); SELECT D, MIN(COALESCE(X + 3)) AS S FROM LM GROUP BY D;