Index: ql/src/test/results/clientpositive/show_functions.q.out =================================================================== --- ql/src/test/results/clientpositive/show_functions.q.out (revision 798787) +++ ql/src/test/results/clientpositive/show_functions.q.out (working copy) @@ -90,6 +90,10 @@ space split sqrt +std +stddev +stddev_pop +stddev_samp string substr substring @@ -100,6 +104,9 @@ ucase unix_timestamp upper +var_pop +var_samp +variance when year | @@ -131,6 +138,7 @@ space to_date ucase +variance query: SHOW FUNCTIONS 'log.*' log log10 Index: ql/src/test/results/clientpositive/groupby3_noskew.q.out =================================================================== --- ql/src/test/results/clientpositive/groupby3_noskew.q.out (revision 798787) +++ ql/src/test/results/clientpositive/groupby3_noskew.q.out (working copy) @@ -1,9 +1,18 @@ -query: CREATE TABLE dest1(c1 DOUBLE, c2 DOUBLE, c3 DOUBLE, c4 DOUBLE, c5 DOUBLE) STORED AS TEXTFILE +query: CREATE TABLE dest1(c1 DOUBLE, c2 DOUBLE, c3 DOUBLE, c4 DOUBLE, c5 DOUBLE, c6 DOUBLE, c7 DOUBLE, c8 DOUBLE, c9 DOUBLE) STORED AS TEXTFILE query: EXPLAIN FROM src -INSERT OVERWRITE TABLE dest1 SELECT sum(substr(src.value,5)), avg(substr(src.value,5)), avg(DISTINCT substr(src.value,5)), max(substr(src.value,5)), min(substr(src.value,5)) +INSERT OVERWRITE TABLE dest1 SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + avg(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + std(substr(src.value,5)), + stddev_samp(substr(src.value,5)), + variance(substr(src.value,5)), + var_samp(substr(src.value,5)) ABSTRACT SYNTAX TREE: - (TOK_QUERY (TOK_FROM (TOK_TABREF src)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB dest1)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION sum (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION avg (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTIONDI avg (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION max (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION min (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5)))))) + (TOK_QUERY (TOK_FROM (TOK_TABREF src)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB dest1)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION sum (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION avg (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTIONDI avg (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION max (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION min (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION std (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION stddev_samp (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION variance (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION var_samp (TOK_FUNCTION substr (. 
(TOK_TABLE_OR_COL src) value) 5)))))) STAGE DEPENDENCIES: Stage-1 is a root stage @@ -33,8 +42,12 @@ expr: avg(DISTINCT KEY._col0) expr: max(KEY._col0) expr: min(KEY._col0) + expr: std(KEY._col0) + expr: stddev_samp(KEY._col0) + expr: variance(KEY._col0) + expr: var_samp(KEY._col0) mode: complete - outputColumnNames: _col0, _col1, _col2, _col3, _col4 + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8 Select Operator expressions: expr: _col0 @@ -47,7 +60,15 @@ type: string expr: _col4 type: string - outputColumnNames: _col0, _col1, _col2, _col3, _col4 + expr: _col5 + type: double + expr: _col6 + type: double + expr: _col7 + type: double + expr: _col8 + type: double + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8 Select Operator expressions: expr: _col0 @@ -60,7 +81,15 @@ type: double expr: UDFToDouble(_col4) type: double - outputColumnNames: _col0, _col1, _col2, _col3, _col4 + expr: _col5 + type: double + expr: _col6 + type: double + expr: _col7 + type: double + expr: _col8 + type: double + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8 File Output Operator compressed: false GlobalTableId: 1 @@ -82,10 +111,20 @@ query: FROM src -INSERT OVERWRITE TABLE dest1 SELECT sum(substr(src.value,5)), avg(substr(src.value,5)), avg(DISTINCT substr(src.value,5)), max(substr(src.value,5)), min(substr(src.value,5)) +INSERT OVERWRITE TABLE dest1 SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + avg(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + std(substr(src.value,5)), + stddev_samp(substr(src.value,5)), + variance(substr(src.value,5)), + var_samp(substr(src.value,5)) Input: default/src Output: default/dest1 query: SELECT dest1.* FROM dest1 Input: default/dest1 -Output: file:/data/users/zshao/tools/670-trunk-apache-hive/.ptest_1/build/ql/tmp/751604196/10000 -130091.0 260.182 256.10355987055016 98.0 0.0 +Output: file:/data/users/emil/hive1/hive1/build/ql/tmp/537230860/10000 +130091.0 260.182 256.10355987055016 98.0 0.0 142.92680950752379 143.06995106518903 20428.072875999995 20469.010897795586 +query: DROP TABLE dest1 Index: ql/src/test/results/clientpositive/groupby3_map_skew.q.out =================================================================== --- ql/src/test/results/clientpositive/groupby3_map_skew.q.out (revision 798787) +++ ql/src/test/results/clientpositive/groupby3_map_skew.q.out (working copy) @@ -1,9 +1,18 @@ -query: CREATE TABLE dest1(c1 DOUBLE, c2 DOUBLE, c3 DOUBLE, c4 DOUBLE, c5 DOUBLE) STORED AS TEXTFILE +query: CREATE TABLE dest1(c1 DOUBLE, c2 DOUBLE, c3 DOUBLE, c4 DOUBLE, c5 DOUBLE, c6 DOUBLE, c7 DOUBLE, c8 DOUBLE, c9 DOUBLE) STORED AS TEXTFILE query: EXPLAIN FROM src -INSERT OVERWRITE TABLE dest1 SELECT sum(substr(src.value,5)), avg(substr(src.value,5)), avg(DISTINCT substr(src.value,5)), max(substr(src.value,5)), min(substr(src.value,5)) +INSERT OVERWRITE TABLE dest1 SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + avg(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + std(substr(src.value,5)), + stddev_samp(substr(src.value,5)), + variance(substr(src.value,5)), + var_samp(substr(src.value,5)) ABSTRACT SYNTAX TREE: - (TOK_QUERY (TOK_FROM (TOK_TABREF src)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB dest1)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION sum (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION avg (TOK_FUNCTION substr (. 
(TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTIONDI avg (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION max (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION min (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5)))))) + (TOK_QUERY (TOK_FROM (TOK_TABREF src)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB dest1)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION sum (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION avg (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTIONDI avg (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION max (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION min (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION std (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION stddev_samp (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION variance (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION var_samp (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5)))))) STAGE DEPENDENCIES: Stage-1 is a root stage @@ -27,11 +36,15 @@ expr: avg(DISTINCT substr(value, 5)) expr: max(substr(value, 5)) expr: min(substr(value, 5)) + expr: std(substr(value, 5)) + expr: stddev_samp(substr(value, 5)) + expr: variance(substr(value, 5)) + expr: var_samp(substr(value, 5)) keys: expr: substr(value, 5) type: string mode: hash - outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5 + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9 Reduce Output Operator key expressions: expr: _col0 @@ -52,6 +65,14 @@ type: string expr: _col5 type: string + expr: _col6 + type: struct + expr: _col7 + type: struct + expr: _col8 + type: struct + expr: _col9 + type: struct Reduce Operator Tree: Group By Operator aggregations: @@ -60,8 +81,12 @@ expr: avg(DISTINCT KEY._col0) expr: max(VALUE._col3) expr: min(VALUE._col4) + expr: std(VALUE._col5) + expr: stddev_samp(VALUE._col6) + expr: variance(VALUE._col7) + expr: var_samp(VALUE._col8) mode: partials - outputColumnNames: _col0, _col1, _col2, _col3, _col4 + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8 File Output Operator compressed: false GlobalTableId: 0 @@ -72,7 +97,7 @@ Stage: Stage-2 Map Reduce Alias -> Map Operator Tree: - file:/data/users/zshao/tools/670-trunk-apache-hive/.ptest_0/build/ql/tmp/1181571992/10002 + file:/data/users/emil/hive1/hive1/build/ql/tmp/738212968/10002 Reduce Output Operator sort order: tag: -1 @@ -87,6 +112,14 @@ type: string expr: _col4 type: string + expr: _col5 + type: struct + expr: _col6 + type: struct + expr: _col7 + type: struct + expr: _col8 + type: struct Reduce Operator Tree: Group By Operator aggregations: @@ -95,8 +128,12 @@ expr: avg(VALUE._col2) expr: max(VALUE._col3) expr: min(VALUE._col4) + expr: std(VALUE._col5) + expr: stddev_samp(VALUE._col6) + expr: variance(VALUE._col7) + expr: var_samp(VALUE._col8) mode: final - outputColumnNames: _col0, _col1, _col2, _col3, _col4 + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8 Select Operator expressions: expr: _col0 @@ -109,7 +146,15 @@ type: string expr: _col4 type: string - outputColumnNames: _col0, _col1, _col2, _col3, _col4 + expr: _col5 + type: double + expr: _col6 + type: double + expr: _col7 + type: double + expr: 
_col8 + type: double + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8 Select Operator expressions: expr: _col0 @@ -122,7 +167,15 @@ type: double expr: UDFToDouble(_col4) type: double - outputColumnNames: _col0, _col1, _col2, _col3, _col4 + expr: _col5 + type: double + expr: _col6 + type: double + expr: _col7 + type: double + expr: _col8 + type: double + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8 File Output Operator compressed: false GlobalTableId: 1 @@ -144,10 +197,20 @@ query: FROM src -INSERT OVERWRITE TABLE dest1 SELECT sum(substr(src.value,5)), avg(substr(src.value,5)), avg(DISTINCT substr(src.value,5)), max(substr(src.value,5)), min(substr(src.value,5)) +INSERT OVERWRITE TABLE dest1 SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + avg(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + std(substr(src.value,5)), + stddev_samp(substr(src.value,5)), + variance(substr(src.value,5)), + var_samp(substr(src.value,5)) Input: default/src Output: default/dest1 query: SELECT dest1.* FROM dest1 Input: default/dest1 -Output: file:/data/users/zshao/tools/670-trunk-apache-hive/.ptest_0/build/ql/tmp/94508894/10000 -130091.0 260.182 256.10355987055016 98.0 0.0 +Output: file:/data/users/emil/hive1/hive1/build/ql/tmp/528887072/10000 +130091.0 260.182 256.10355987055016 98.0 0.0 142.9268095075238 143.06995106518906 20428.072876 20469.01089779559 +query: DROP TABLE dest1 Index: ql/src/test/results/clientpositive/groupby3_map.q.out =================================================================== --- ql/src/test/results/clientpositive/groupby3_map.q.out (revision 798787) +++ ql/src/test/results/clientpositive/groupby3_map.q.out (working copy) @@ -1,9 +1,18 @@ -query: CREATE TABLE dest1(c1 DOUBLE, c2 DOUBLE, c3 DOUBLE, c4 DOUBLE, c5 DOUBLE) STORED AS TEXTFILE +query: CREATE TABLE dest1(c1 DOUBLE, c2 DOUBLE, c3 DOUBLE, c4 DOUBLE, c5 DOUBLE, c6 DOUBLE, c7 DOUBLE, c8 DOUBLE, c9 DOUBLE) STORED AS TEXTFILE query: EXPLAIN FROM src -INSERT OVERWRITE TABLE dest1 SELECT sum(substr(src.value,5)), avg(substr(src.value,5)), avg(DISTINCT substr(src.value,5)), max(substr(src.value,5)), min(substr(src.value,5)) +INSERT OVERWRITE TABLE dest1 SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + avg(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + std(substr(src.value,5)), + stddev_samp(substr(src.value,5)), + variance(substr(src.value,5)), + var_samp(substr(src.value,5)) ABSTRACT SYNTAX TREE: - (TOK_QUERY (TOK_FROM (TOK_TABREF src)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB dest1)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION sum (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION avg (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTIONDI avg (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION max (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION min (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5)))))) + (TOK_QUERY (TOK_FROM (TOK_TABREF src)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB dest1)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION sum (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION avg (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTIONDI avg (TOK_FUNCTION substr (. 
(TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION max (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION min (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION std (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION stddev_samp (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION variance (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION var_samp (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5)))))) STAGE DEPENDENCIES: Stage-1 is a root stage @@ -26,11 +35,15 @@ expr: avg(DISTINCT substr(value, 5)) expr: max(substr(value, 5)) expr: min(substr(value, 5)) + expr: std(substr(value, 5)) + expr: stddev_samp(substr(value, 5)) + expr: variance(substr(value, 5)) + expr: var_samp(substr(value, 5)) keys: expr: substr(value, 5) type: string mode: hash - outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5 + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9 Reduce Output Operator key expressions: expr: _col0 @@ -48,6 +61,14 @@ type: string expr: _col5 type: string + expr: _col6 + type: struct + expr: _col7 + type: struct + expr: _col8 + type: struct + expr: _col9 + type: struct Reduce Operator Tree: Group By Operator aggregations: @@ -56,8 +77,12 @@ expr: avg(DISTINCT KEY._col0) expr: max(VALUE._col3) expr: min(VALUE._col4) + expr: std(VALUE._col5) + expr: stddev_samp(VALUE._col6) + expr: variance(VALUE._col7) + expr: var_samp(VALUE._col8) mode: mergepartial - outputColumnNames: _col0, _col1, _col2, _col3, _col4 + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8 Select Operator expressions: expr: _col0 @@ -70,7 +95,15 @@ type: string expr: _col4 type: string - outputColumnNames: _col0, _col1, _col2, _col3, _col4 + expr: _col5 + type: double + expr: _col6 + type: double + expr: _col7 + type: double + expr: _col8 + type: double + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8 Select Operator expressions: expr: _col0 @@ -83,7 +116,15 @@ type: double expr: UDFToDouble(_col4) type: double - outputColumnNames: _col0, _col1, _col2, _col3, _col4 + expr: _col5 + type: double + expr: _col6 + type: double + expr: _col7 + type: double + expr: _col8 + type: double + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8 File Output Operator compressed: false GlobalTableId: 1 @@ -105,10 +146,20 @@ query: FROM src -INSERT OVERWRITE TABLE dest1 SELECT sum(substr(src.value,5)), avg(substr(src.value,5)), avg(DISTINCT substr(src.value,5)), max(substr(src.value,5)), min(substr(src.value,5)) +INSERT OVERWRITE TABLE dest1 SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + avg(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + std(substr(src.value,5)), + stddev_samp(substr(src.value,5)), + variance(substr(src.value,5)), + var_samp(substr(src.value,5)) Input: default/src Output: default/dest1 query: SELECT dest1.* FROM dest1 Input: default/dest1 -Output: file:/data/users/zshao/tools/670-trunk-apache-hive/.ptest_2/build/ql/tmp/622549984/10000 -130091.0 260.182 256.10355987055016 98.0 0.0 +Output: file:/data/users/emil/hive1/hive1/build/ql/tmp/1984222937/10000 +130091.0 260.182 256.10355987055016 98.0 0.0 142.9268095075238 143.06995106518906 20428.072876 20469.01089779559 +query: DROP TABLE dest1 Index: ql/src/test/results/clientpositive/groupby3.q.out 
=================================================================== --- ql/src/test/results/clientpositive/groupby3.q.out (revision 798787) +++ ql/src/test/results/clientpositive/groupby3.q.out (working copy) @@ -1,9 +1,18 @@ -query: CREATE TABLE dest1(c1 DOUBLE, c2 DOUBLE, c3 DOUBLE, c4 DOUBLE, c5 DOUBLE) STORED AS TEXTFILE +query: CREATE TABLE dest1(c1 DOUBLE, c2 DOUBLE, c3 DOUBLE, c4 DOUBLE, c5 DOUBLE, c6 DOUBLE, c7 DOUBLE, c8 DOUBLE, c9 DOUBLE) STORED AS TEXTFILE query: EXPLAIN FROM src -INSERT OVERWRITE TABLE dest1 SELECT sum(substr(src.value,5)), avg(substr(src.value,5)), avg(DISTINCT substr(src.value,5)), max(substr(src.value,5)), min(substr(src.value,5)) +INSERT OVERWRITE TABLE dest1 SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + avg(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + std(substr(src.value,5)), + stddev_samp(substr(src.value,5)), + variance(substr(src.value,5)), + var_samp(substr(src.value,5)) ABSTRACT SYNTAX TREE: - (TOK_QUERY (TOK_FROM (TOK_TABREF src)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB dest1)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION sum (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION avg (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTIONDI avg (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION max (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION min (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5)))))) + (TOK_QUERY (TOK_FROM (TOK_TABREF src)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB dest1)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION sum (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION avg (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTIONDI avg (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION max (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION min (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION std (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION stddev_samp (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION variance (TOK_FUNCTION substr (. (TOK_TABLE_OR_COL src) value) 5))) (TOK_SELEXPR (TOK_FUNCTION var_samp (TOK_FUNCTION substr (. 
(TOK_TABLE_OR_COL src) value) 5)))))) STAGE DEPENDENCIES: Stage-1 is a root stage @@ -37,8 +46,12 @@ expr: avg(DISTINCT KEY._col0) expr: max(KEY._col0) expr: min(KEY._col0) + expr: std(KEY._col0) + expr: stddev_samp(KEY._col0) + expr: variance(KEY._col0) + expr: var_samp(KEY._col0) mode: partial1 - outputColumnNames: _col0, _col1, _col2, _col3, _col4 + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8 File Output Operator compressed: false GlobalTableId: 0 @@ -49,7 +62,7 @@ Stage: Stage-2 Map Reduce Alias -> Map Operator Tree: - file:/data/users/zshao/tools/670-trunk-apache-hive/.ptest_2/build/ql/tmp/288308448/10002 + file:/data/users/emil/hive1/hive1/build/ql/tmp/1684499238/10002 Reduce Output Operator sort order: tag: -1 @@ -64,6 +77,14 @@ type: string expr: _col4 type: string + expr: _col5 + type: struct + expr: _col6 + type: struct + expr: _col7 + type: struct + expr: _col8 + type: struct Reduce Operator Tree: Group By Operator aggregations: @@ -72,8 +93,12 @@ expr: avg(VALUE._col2) expr: max(VALUE._col3) expr: min(VALUE._col4) + expr: std(VALUE._col5) + expr: stddev_samp(VALUE._col6) + expr: variance(VALUE._col7) + expr: var_samp(VALUE._col8) mode: final - outputColumnNames: _col0, _col1, _col2, _col3, _col4 + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8 Select Operator expressions: expr: _col0 @@ -86,7 +111,15 @@ type: string expr: _col4 type: string - outputColumnNames: _col0, _col1, _col2, _col3, _col4 + expr: _col5 + type: double + expr: _col6 + type: double + expr: _col7 + type: double + expr: _col8 + type: double + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8 Select Operator expressions: expr: _col0 @@ -99,7 +132,15 @@ type: double expr: UDFToDouble(_col4) type: double - outputColumnNames: _col0, _col1, _col2, _col3, _col4 + expr: _col5 + type: double + expr: _col6 + type: double + expr: _col7 + type: double + expr: _col8 + type: double + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8 File Output Operator compressed: false GlobalTableId: 1 @@ -121,10 +162,20 @@ query: FROM src -INSERT OVERWRITE TABLE dest1 SELECT sum(substr(src.value,5)), avg(substr(src.value,5)), avg(DISTINCT substr(src.value,5)), max(substr(src.value,5)), min(substr(src.value,5)) +INSERT OVERWRITE TABLE dest1 SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + avg(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + std(substr(src.value,5)), + stddev_samp(substr(src.value,5)), + variance(substr(src.value,5)), + var_samp(substr(src.value,5)) Input: default/src Output: default/dest1 query: SELECT dest1.* FROM dest1 Input: default/dest1 -Output: file:/data/users/zshao/tools/670-trunk-apache-hive/.ptest_2/build/ql/tmp/1560730520/10000 -130091.0 260.182 256.10355987055016 98.0 0.0 +Output: file:/data/users/emil/hive1/hive1/build/ql/tmp/1807842455/10000 +130091.0 260.182 256.10355987055016 98.0 0.0 142.92680950752379 143.06995106518903 20428.072875999995 20469.010897795586 +query: DROP TABLE dest1 Index: ql/src/test/queries/clientpositive/groupby3_map_skew.q =================================================================== --- ql/src/test/queries/clientpositive/groupby3_map_skew.q (revision 798787) +++ ql/src/test/queries/clientpositive/groupby3_map_skew.q (working copy) @@ -2,13 +2,33 @@ set hive.groupby.skewindata=true; set mapred.reduce.tasks=31; -CREATE TABLE dest1(c1 DOUBLE, c2 DOUBLE, c3 DOUBLE, c4 DOUBLE, c5 DOUBLE) STORED 
AS TEXTFILE; +CREATE TABLE dest1(c1 DOUBLE, c2 DOUBLE, c3 DOUBLE, c4 DOUBLE, c5 DOUBLE, c6 DOUBLE, c7 DOUBLE, c8 DOUBLE, c9 DOUBLE) STORED AS TEXTFILE; EXPLAIN FROM src -INSERT OVERWRITE TABLE dest1 SELECT sum(substr(src.value,5)), avg(substr(src.value,5)), avg(DISTINCT substr(src.value,5)), max(substr(src.value,5)), min(substr(src.value,5)); +INSERT OVERWRITE TABLE dest1 SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + avg(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + std(substr(src.value,5)), + stddev_samp(substr(src.value,5)), + variance(substr(src.value,5)), + var_samp(substr(src.value,5)); FROM src -INSERT OVERWRITE TABLE dest1 SELECT sum(substr(src.value,5)), avg(substr(src.value,5)), avg(DISTINCT substr(src.value,5)), max(substr(src.value,5)), min(substr(src.value,5)); +INSERT OVERWRITE TABLE dest1 SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + avg(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + std(substr(src.value,5)), + stddev_samp(substr(src.value,5)), + variance(substr(src.value,5)), + var_samp(substr(src.value,5)); SELECT dest1.* FROM dest1; + +DROP TABLE dest1; Index: ql/src/test/queries/clientpositive/groupby3_map.q =================================================================== --- ql/src/test/queries/clientpositive/groupby3_map.q (revision 798787) +++ ql/src/test/queries/clientpositive/groupby3_map.q (working copy) @@ -2,13 +2,33 @@ set hive.groupby.skewindata=false; set mapred.reduce.tasks=31; -CREATE TABLE dest1(c1 DOUBLE, c2 DOUBLE, c3 DOUBLE, c4 DOUBLE, c5 DOUBLE) STORED AS TEXTFILE; +CREATE TABLE dest1(c1 DOUBLE, c2 DOUBLE, c3 DOUBLE, c4 DOUBLE, c5 DOUBLE, c6 DOUBLE, c7 DOUBLE, c8 DOUBLE, c9 DOUBLE) STORED AS TEXTFILE; EXPLAIN FROM src -INSERT OVERWRITE TABLE dest1 SELECT sum(substr(src.value,5)), avg(substr(src.value,5)), avg(DISTINCT substr(src.value,5)), max(substr(src.value,5)), min(substr(src.value,5)); +INSERT OVERWRITE TABLE dest1 SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + avg(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + std(substr(src.value,5)), + stddev_samp(substr(src.value,5)), + variance(substr(src.value,5)), + var_samp(substr(src.value,5)); FROM src -INSERT OVERWRITE TABLE dest1 SELECT sum(substr(src.value,5)), avg(substr(src.value,5)), avg(DISTINCT substr(src.value,5)), max(substr(src.value,5)), min(substr(src.value,5)); +INSERT OVERWRITE TABLE dest1 SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + avg(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + std(substr(src.value,5)), + stddev_samp(substr(src.value,5)), + variance(substr(src.value,5)), + var_samp(substr(src.value,5)); SELECT dest1.* FROM dest1; + +DROP TABLE dest1; Index: ql/src/test/queries/clientpositive/groupby3.q =================================================================== --- ql/src/test/queries/clientpositive/groupby3.q (revision 798787) +++ ql/src/test/queries/clientpositive/groupby3.q (working copy) @@ -1,13 +1,33 @@ set hive.map.aggr=false; set hive.groupby.skewindata=true; -CREATE TABLE dest1(c1 DOUBLE, c2 DOUBLE, c3 DOUBLE, c4 DOUBLE, c5 DOUBLE) STORED AS TEXTFILE; +CREATE TABLE dest1(c1 DOUBLE, c2 DOUBLE, c3 DOUBLE, c4 DOUBLE, c5 DOUBLE, c6 DOUBLE, c7 DOUBLE, c8 DOUBLE, c9 DOUBLE) STORED AS TEXTFILE; EXPLAIN FROM src -INSERT OVERWRITE TABLE dest1 SELECT sum(substr(src.value,5)), avg(substr(src.value,5)), avg(DISTINCT substr(src.value,5)), 
max(substr(src.value,5)), min(substr(src.value,5)); +INSERT OVERWRITE TABLE dest1 SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + avg(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + std(substr(src.value,5)), + stddev_samp(substr(src.value,5)), + variance(substr(src.value,5)), + var_samp(substr(src.value,5)); FROM src -INSERT OVERWRITE TABLE dest1 SELECT sum(substr(src.value,5)), avg(substr(src.value,5)), avg(DISTINCT substr(src.value,5)), max(substr(src.value,5)), min(substr(src.value,5)); +INSERT OVERWRITE TABLE dest1 SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + avg(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + std(substr(src.value,5)), + stddev_samp(substr(src.value,5)), + variance(substr(src.value,5)), + var_samp(substr(src.value,5)); SELECT dest1.* FROM dest1; + +DROP TABLE dest1; Index: ql/src/test/queries/clientpositive/groupby3_noskew.q =================================================================== --- ql/src/test/queries/clientpositive/groupby3_noskew.q (revision 798787) +++ ql/src/test/queries/clientpositive/groupby3_noskew.q (working copy) @@ -3,13 +3,34 @@ set hive.groupby.skewindata=false; set mapred.reduce.tasks=31; -CREATE TABLE dest1(c1 DOUBLE, c2 DOUBLE, c3 DOUBLE, c4 DOUBLE, c5 DOUBLE) STORED AS TEXTFILE; +CREATE TABLE dest1(c1 DOUBLE, c2 DOUBLE, c3 DOUBLE, c4 DOUBLE, c5 DOUBLE, c6 DOUBLE, c7 DOUBLE, c8 DOUBLE, c9 DOUBLE) STORED AS TEXTFILE; EXPLAIN FROM src -INSERT OVERWRITE TABLE dest1 SELECT sum(substr(src.value,5)), avg(substr(src.value,5)), avg(DISTINCT substr(src.value,5)), max(substr(src.value,5)), min(substr(src.value,5)); +INSERT OVERWRITE TABLE dest1 SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + avg(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + std(substr(src.value,5)), + stddev_samp(substr(src.value,5)), + variance(substr(src.value,5)), + var_samp(substr(src.value,5)); FROM src -INSERT OVERWRITE TABLE dest1 SELECT sum(substr(src.value,5)), avg(substr(src.value,5)), avg(DISTINCT substr(src.value,5)), max(substr(src.value,5)), min(substr(src.value,5)); +INSERT OVERWRITE TABLE dest1 SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + avg(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + std(substr(src.value,5)), + stddev_samp(substr(src.value,5)), + variance(substr(src.value,5)), + var_samp(substr(src.value,5)); SELECT dest1.* FROM dest1; + +DROP TABLE dest1; + Index: ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java (revision 798787) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java (working copy) @@ -181,6 +181,15 @@ registerGenericUDAF("sum", new GenericUDAFSum()); registerGenericUDAF("count", new GenericUDAFCount()); registerGenericUDAF("avg", new GenericUDAFAverage()); + + registerGenericUDAF("std", new GenericUDAFStd()); + registerGenericUDAF("stddev", new GenericUDAFStd()); + registerGenericUDAF("stddev_pop", new GenericUDAFStd()); + registerGenericUDAF("stddev_samp", new GenericUDAFStdSample()); + registerGenericUDAF("variance", new GenericUDAFVariance()); + registerGenericUDAF("var_pop", new GenericUDAFVariance()); + registerGenericUDAF("var_samp", new GenericUDAFVarianceSample()); + registerUDAF("max", UDAFMax.class); registerUDAF("min", 
UDAFMin.class);
Index: ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFVariance.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFVariance.java (revision 0)
+++ ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFVariance.java (revision 0)
@@ -0,0 +1,258 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.udf.generic;
+
+import java.util.ArrayList;
+
+import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.serde2.io.DoubleWritable;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.StructField;
+import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils;
+import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+import org.apache.hadoop.io.LongWritable;
+
+public class GenericUDAFVariance implements GenericUDAFResolver {
+
+  @Override
+  public GenericUDAFEvaluator getEvaluator(
+      TypeInfo[] parameters) throws SemanticException {
+    if (parameters.length != 1) {
+      throw new UDFArgumentTypeException(parameters.length - 1,
+          "Exactly one argument is expected.");
+    }
+
+    if (parameters[0].getCategory() != ObjectInspector.Category.PRIMITIVE) {
+      throw new UDFArgumentTypeException(0,
+          "Only primitive type arguments are accepted but "
+          + parameters[0].getTypeName() + " is passed.");
+    }
+    switch (((PrimitiveTypeInfo)parameters[0]).getPrimitiveCategory()) {
+    case BYTE:
+    case SHORT:
+    case INT:
+    case LONG:
+    case FLOAT:
+    case DOUBLE:
+    case STRING:
+      return new GenericUDAFVarianceEvaluator();
+    case BOOLEAN:
+    default:
+      throw new UDFArgumentTypeException(0,
+          "Only numeric or string type arguments are accepted but "
+          + parameters[0].getTypeName() + " is passed.");
+    }
+  }
+
+  /**
+   * Evaluate the variance using the following modification of the formula from
+   * The Art of Computer Programming, vol. 2, p. 232:
+   *
+   * variance = variance1 + variance2 + n*alpha^2 + m*beta^2
+   *
+   * where:
+   * - variance is sum[(x-avg)^2] (this is actually n times the variance) and is
+   *   updated at every step.
+   * - n is the count of elements in chunk1
+   * - m is the count of elements in chunk2
+   * - alpha = avg - a
+   * - beta = avg - b
+   * - avg is the average of all elements from both chunks
+   * - a is the average of elements in chunk1
+   * - b is the average of elements in chunk2
+   */
+  public static class GenericUDAFVarianceEvaluator extends GenericUDAFEvaluator {
+
+    // For PARTIAL1 and COMPLETE
+    PrimitiveObjectInspector inputOI;
+
+    // For PARTIAL2 and FINAL
+    StructObjectInspector soi;
+    StructField countField;
+    StructField sumField;
+    StructField varianceField;
+    LongObjectInspector countFieldOI;
+    DoubleObjectInspector sumFieldOI;
+    DoubleObjectInspector varianceFieldOI;
+
+    // For PARTIAL1 and PARTIAL2
+    Object[] partialResult;
+
+    // For FINAL and COMPLETE
+    DoubleWritable result;
+
+    @Override
+    public ObjectInspector init(Mode m, ObjectInspector[] parameters)
+        throws HiveException {
+      assert(parameters.length == 1);
+      super.init(m, parameters);
+
+      // init input
+      if (mode == Mode.PARTIAL1 || mode == Mode.COMPLETE) {
+        inputOI = (PrimitiveObjectInspector)parameters[0];
+      } else {
+        soi = (StructObjectInspector)parameters[0];
+        countField = soi.getStructFieldRef("count");
+        sumField = soi.getStructFieldRef("sum");
+        varianceField = soi.getStructFieldRef("variance");
+        countFieldOI =
+            (LongObjectInspector)countField.getFieldObjectInspector();
+        sumFieldOI = (DoubleObjectInspector)sumField.getFieldObjectInspector();
+        varianceFieldOI =
+            (DoubleObjectInspector)varianceField.getFieldObjectInspector();
+      }
+
+      // init output
+      if (mode == Mode.PARTIAL1 || mode == Mode.PARTIAL2) {
+        // The output of a partial aggregation is a struct containing
+        // a "long" count and a "double" sum and variance.
+
+        ArrayList<ObjectInspector> foi = new ArrayList<ObjectInspector>();
+        foi.add(PrimitiveObjectInspectorFactory.writableLongObjectInspector);
+        foi.add(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector);
+        foi.add(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector);
+        ArrayList<String> fname = new ArrayList<String>();
+        fname.add("count");
+        fname.add("sum");
+        fname.add("variance");
+        partialResult = new Object[3];
+        partialResult[0] = new LongWritable(0);
+        partialResult[1] = new DoubleWritable(0);
+        partialResult[2] = new DoubleWritable(0);
+        return ObjectInspectorFactory.getStandardStructObjectInspector(
+            fname, foi);
+
+      } else {
+        result = new DoubleWritable(0);
+        return PrimitiveObjectInspectorFactory.writableDoubleObjectInspector;
+      }
+    }
+
+    static class StdAgg implements AggregationBuffer {
+      long count;
+      double sum;
+      double variance; // sum of squared deviations from the mean, not yet divided by count
+    }
+
+    @Override
+    public AggregationBuffer getNewAggregationBuffer() throws HiveException {
+      StdAgg result = new StdAgg();
+      reset(result);
+      return result;
+    }
+
+    @Override
+    public void reset(AggregationBuffer agg) throws HiveException {
+      StdAgg myagg = (StdAgg)agg;
+      myagg.count = 0;
+      myagg.sum = 0;
+      myagg.variance = 0;
+    }
+
+    @Override
+    public void iterate(AggregationBuffer agg, Object[] parameters)
+        throws HiveException {
+      assert(parameters.length == 1);
+      Object p = parameters[0];
+      if (p != null) {
+        StdAgg myagg = (StdAgg)agg;
+        double v = PrimitiveObjectInspectorUtils.getDouble(p,
+            (PrimitiveObjectInspector)inputOI);
+
+        if (myagg.count != 0) { // the merge formula with m = 1
+          double alpha = (myagg.sum + v) / (myagg.count+1)
+              - myagg.sum / myagg.count;
+          double beta = (myagg.sum + v) / (myagg.count+1) - v;
+
+          myagg.variance += myagg.count*alpha*alpha + beta*beta;
+        }
+        myagg.count++;
+        myagg.sum += v;
+      }
+    }
+
+    @Override
+    public Object terminatePartial(AggregationBuffer agg) throws HiveException {
+      StdAgg myagg = (StdAgg)agg;
+      ((LongWritable)partialResult[0]).set(myagg.count);
+      ((DoubleWritable)partialResult[1]).set(myagg.sum);
+      ((DoubleWritable)partialResult[2]).set(myagg.variance);
+      return partialResult;
+    }
+
+    @Override
+    public void merge(AggregationBuffer agg, Object partial) throws HiveException {
+      if (partial != null) {
+        StdAgg myagg = (StdAgg)agg;
+        Object partialCount = soi.getStructFieldData(partial, countField);
+        Object partialSum = soi.getStructFieldData(partial, sumField);
+        Object partialVariance = soi.getStructFieldData(partial, varianceField);
+
+        long n = myagg.count;
+        long m = countFieldOI.get(partialCount);
+
+        if (n == 0) {
+          // nothing accumulated locally yet - just copy the partial
+          myagg.variance = varianceFieldOI.get(partialVariance);
+          myagg.count = countFieldOI.get(partialCount);
+          myagg.sum = sumFieldOI.get(partialSum);
+        }
+
+        if (m != 0 && n != 0) {
+          double a = myagg.sum;
+          double b = sumFieldOI.get(partialSum);
+
+          double alpha = (a+b)/(n+m) - a/n;
+          double beta = (a+b)/(n+m) - b/m;
+
+          myagg.variance += varianceFieldOI.get(partialVariance)
+              + (n*alpha*alpha + m*beta*beta);
+          myagg.count += m;
+          myagg.sum += b;
+        }
+
+      }
+    }
+
+    @Override
+    public Object terminate(AggregationBuffer agg) throws HiveException {
+      StdAgg myagg = (StdAgg)agg;
+
+      if (myagg.count == 0) { // SQL standard - return null for zero elements
+        return null;
+      } else {
+        if (myagg.count > 1) {
+          result.set(myagg.variance / myagg.count);
+        } else { // for one element the variance is always 0
+          result.set(0);
+        }
+        return result;
+      }
+    }
+  }
+
+}
Index: ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFStd.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFStd.java (revision 0)
+++ ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFStd.java (revision 0)
@@ -0,0 +1,86 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.udf.generic;
+
+import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+
+/**
+ * Computes the population standard deviation: sqrt(sum[(x-avg)^2] / count).
+ * Registered as std, stddev and stddev_pop. It reuses the aggregation state
+ * of GenericUDAFVariance and only overrides the final step.
+ */
+public class GenericUDAFStd extends GenericUDAFVariance {
+
+  @Override
+  public GenericUDAFEvaluator getEvaluator(
+      TypeInfo[] parameters) throws SemanticException {
+    if (parameters.length != 1) {
+      throw new UDFArgumentTypeException(parameters.length - 1,
+          "Exactly one argument is expected.");
+    }
+
+    if (parameters[0].getCategory() != ObjectInspector.Category.PRIMITIVE) {
+      throw new UDFArgumentTypeException(0,
+          "Only primitive type arguments are accepted but "
+          + parameters[0].getTypeName() + " is passed.");
+    }
+    switch (((PrimitiveTypeInfo)parameters[0]).getPrimitiveCategory()) {
+    case BYTE:
+    case SHORT:
+    case INT:
+    case LONG:
+    case FLOAT:
+    case DOUBLE:
+    case STRING:
+      return new GenericUDAFStdEvaluator();
+    case BOOLEAN:
+    default:
+      throw new UDFArgumentTypeException(0,
+          "Only numeric or string type arguments are accepted but "
+          + parameters[0].getTypeName() + " is passed.");
+    }
+  }
+
+  public static class GenericUDAFStdEvaluator extends GenericUDAFVarianceEvaluator {
+
+    @Override
+    public Object terminate(AggregationBuffer agg) throws HiveException {
+      StdAgg myagg = (StdAgg)agg;
+
+      if (myagg.count == 0) { // SQL standard - return null for zero elements
+        return null;
+      } else {
+        if (myagg.count > 1) {
+          result.set(Math.sqrt(myagg.variance / myagg.count));
+        } else { // for one element the standard deviation is always 0
+          result.set(0);
+        }
+        return result;
+      }
+    }
+  }
+
+}
Index: ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFVarianceSample.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFVarianceSample.java (revision 0)
+++ ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFVarianceSample.java (revision 0)
@@ -0,0 +1,86 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.udf.generic;
+
+import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+
+/**
+ * Computes the sample variance: sum[(x-avg)^2] / (count-1). Registered as
+ * var_samp. It reuses the aggregation state of GenericUDAFVariance and only
+ * overrides the final step.
+ */
+public class GenericUDAFVarianceSample extends GenericUDAFVariance {
+
+  @Override
+  public GenericUDAFEvaluator getEvaluator(
+      TypeInfo[] parameters) throws SemanticException {
+    if (parameters.length != 1) {
+      throw new UDFArgumentTypeException(parameters.length - 1,
+          "Exactly one argument is expected.");
+    }
+
+    if (parameters[0].getCategory() != ObjectInspector.Category.PRIMITIVE) {
+      throw new UDFArgumentTypeException(0,
+          "Only primitive type arguments are accepted but "
+          + parameters[0].getTypeName() + " is passed.");
+    }
+    switch (((PrimitiveTypeInfo)parameters[0]).getPrimitiveCategory()) {
+    case BYTE:
+    case SHORT:
+    case INT:
+    case LONG:
+    case FLOAT:
+    case DOUBLE:
+    case STRING:
+      return new GenericUDAFVarianceSampleEvaluator();
+    case BOOLEAN:
+    default:
+      throw new UDFArgumentTypeException(0,
+          "Only numeric or string type arguments are accepted but "
+          + parameters[0].getTypeName() + " is passed.");
+    }
+  }
+
+  public static class GenericUDAFVarianceSampleEvaluator extends GenericUDAFVarianceEvaluator {
+
+    @Override
+    public Object terminate(AggregationBuffer agg) throws HiveException {
+      StdAgg myagg = (StdAgg)agg;
+
+      if (myagg.count == 0) { // SQL standard - return null for zero elements
+        return null;
+      } else {
+        if (myagg.count > 1) {
+          result.set(myagg.variance / (myagg.count - 1));
+        } else { // for one element the variance is always 0
+          result.set(0);
+        }
+        return result;
+      }
+    }
+  }
+
+}
Index: ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFStdSample.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFStdSample.java (revision 0)
+++ ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFStdSample.java (revision 0)
@@ -0,0 +1,86 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.udf.generic;
+
+import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+
+/**
+ * Computes the sample standard deviation: sqrt(sum[(x-avg)^2] / (count-1)).
+ * Registered as stddev_samp. It reuses the aggregation state of
+ * GenericUDAFVariance and only overrides the final step.
+ */
+public class GenericUDAFStdSample extends GenericUDAFVariance {
+
+  @Override
+  public GenericUDAFEvaluator getEvaluator(
+      TypeInfo[] parameters) throws SemanticException {
+    if (parameters.length != 1) {
+      throw new UDFArgumentTypeException(parameters.length - 1,
+          "Exactly one argument is expected.");
+    }
+
+    if (parameters[0].getCategory() != ObjectInspector.Category.PRIMITIVE) {
+      throw new UDFArgumentTypeException(0,
+          "Only primitive type arguments are accepted but "
+          + parameters[0].getTypeName() + " is passed.");
+    }
+    switch (((PrimitiveTypeInfo)parameters[0]).getPrimitiveCategory()) {
+    case BYTE:
+    case SHORT:
+    case INT:
+    case LONG:
+    case FLOAT:
+    case DOUBLE:
+    case STRING:
+      return new GenericUDAFStdSampleEvaluator();
+    case BOOLEAN:
+    default:
+      throw new UDFArgumentTypeException(0,
+          "Only numeric or string type arguments are accepted but "
+          + parameters[0].getTypeName() + " is passed.");
+    }
+  }
+
+  public static class GenericUDAFStdSampleEvaluator extends GenericUDAFVarianceEvaluator {
+
+    @Override
+    public Object terminate(AggregationBuffer agg) throws HiveException {
+      StdAgg myagg = (StdAgg)agg;
+
+      if (myagg.count == 0) { // SQL standard - return null for zero elements
+        return null;
+      } else {
+        if (myagg.count > 1) {
+          result.set(Math.sqrt(myagg.variance / (myagg.count - 1)));
+        } else { // for one element the standard deviation is always 0
+          result.set(0);
+        }
+        return result;
+      }
+    }
+  }
+
+}
Index: ql/src/java/org/apache/hadoop/hive/ql/udf/UDAFStd.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/udf/UDAFStd.java (revision 0)
+++ ql/src/java/org/apache/hadoop/hive/ql/udf/UDAFStd.java (revision 0)
@@ -0,0 +1,79 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.udf;
+
+import org.apache.hadoop.hive.ql.exec.NumericUDAF;
+import org.apache.hadoop.hive.ql.exec.UDAFEvaluator;
+import org.apache.hadoop.hive.serde2.io.DoubleWritable;
+import org.apache.hadoop.io.Text;
+
+/**
+ * Old-style UDAF computing the population standard deviation from the running
+ * (sum, count, sum of squares). Partial results are serialized as
+ * "sum/count+sumOfSquares" in a Text object.
+ */
+public class UDAFStd extends NumericUDAF {
+
+  public static class UDAFStdEvaluator implements UDAFEvaluator {
+    private long mCount;
+    private double mSum;
+    private double mSumOfSquares;
+
+    public UDAFStdEvaluator() {
+      super();
+      init();
+    }
+
+    public void init() {
+      mSum = 0;
+      mCount = 0;
+      mSumOfSquares = 0;
+    }
+
+    public boolean iterate(DoubleWritable o) {
+      if (o != null) {
+        mSum += o.get();
+        mSumOfSquares += o.get()*o.get();
+        mCount++;
+      }
+      return true;
+    }
+
+    public Text terminatePartial() {
+      // This is SQL standard - std of zero items should be null.
+      return mCount == 0 ? null : new Text(String.valueOf(mSum) + '/'
+          + String.valueOf(mCount) + '+' + String.valueOf(mSumOfSquares));
+    }
+
+    public boolean merge(Text o) {
+      if (o != null) {
+        String s = o.toString();
+        int pos1 = s.indexOf('/');
+        int pos2 = s.indexOf('+');
+        assert(pos1 != -1 && pos2 > pos1);
+        mSum += Double.parseDouble(s.substring(0, pos1));
+        mCount += Long.parseLong(s.substring(pos1+1, pos2));
+        mSumOfSquares += Double.parseDouble(s.substring(pos2+1));
+      }
+      return true;
+    }
+
+    public DoubleWritable terminate() {
+      // This is SQL standard - std of zero items should be null.
+      if (mCount == 0) {
+        return null;
+      }
+      // Population standard deviation: sqrt(E[x^2] - E[x]^2).
+      double mean = mSum / mCount;
+      return new DoubleWritable(Math.sqrt(mSumOfSquares / mCount - mean * mean));
+    }
+  }
+
+}
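
Reviewer note (supplementary, not part of the patch): the chunk-combination rule that
GenericUDAFVarianceEvaluator.merge() implements follows from a short calculation. Writing S
for the sum of squared deviations that the code keeps in the "variance" field, a and b for
the chunk means, and n and m for the chunk sizes:

    S_1 = \sum_{i \in C_1} (x_i - a)^2, \qquad
    S_2 = \sum_{j \in C_2} (x_j - b)^2, \qquad
    \bar{x} = \frac{n a + m b}{n + m}

    \sum_{i \in C_1} (x_i - \bar{x})^2
      = \sum_{i \in C_1} \bigl( (x_i - a) + (a - \bar{x}) \bigr)^2
      = S_1 + n (\bar{x} - a)^2

since the cross term vanishes (\sum_{i \in C_1} (x_i - a) = 0). Adding the symmetric
identity for C_2 gives

    S = S_1 + S_2 + n \alpha^2 + m \beta^2, \qquad
    \alpha = \bar{x} - a, \quad \beta = \bar{x} - b,

which is exactly the update performed in merge(); iterate() is the special case of merging
a single new value (m = 1, S_2 = 0).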
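
Reviewer note (supplementary, not part of the patch): the same algebra can be sanity-checked
outside of Hive. The following is a minimal standalone sketch - the class name VarianceCheck
and its helpers are hypothetical and use no Hive classes - that streams values through the
iterate() recurrence, combines two buffers with the merge() rule, and compares the result
against a plain two-pass computation:

    // VarianceCheck.java - standalone sanity check for the update used in
    // GenericUDAFVarianceEvaluator.iterate() and the combination used in merge().
    public class VarianceCheck {

      // State mirroring StdAgg: element count, running sum, and S = sum((x - mean)^2).
      static final class Agg { long count; double sum; double s; }

      // One streaming step, same algebra as iterate().
      static void iterate(Agg a, double v) {
        if (a.count != 0) {
          double alpha = (a.sum + v) / (a.count + 1) - a.sum / a.count; // newAvg - oldAvg
          double beta = (a.sum + v) / (a.count + 1) - v;                // newAvg - v
          a.s += a.count * alpha * alpha + beta * beta;
        }
        a.count++;
        a.sum += v;
      }

      // Chunk combination, same algebra as merge().
      static void merge(Agg a, Agg p) {
        long n = a.count;
        long m = p.count;
        if (n == 0) {
          a.count = p.count; a.sum = p.sum; a.s = p.s;
          return;
        }
        if (m != 0) {
          double alpha = (a.sum + p.sum) / (n + m) - a.sum / n;
          double beta = (a.sum + p.sum) / (n + m) - p.sum / m;
          a.s += p.s + n * alpha * alpha + m * beta * beta;
          a.count += m;
          a.sum += p.sum;
        }
      }

      public static void main(String[] args) {
        double[] data = {98, 0, 42.5, 7, 13, 260.182, 5};

        // Stream half the data into one buffer, half into another, then merge.
        Agg left = new Agg();
        Agg right = new Agg();
        for (int i = 0; i < data.length; i++) {
          iterate(i % 2 == 0 ? left : right, data[i]);
        }
        merge(left, right);

        // Two-pass reference: S = sum((x - mean)^2).
        double mean = 0;
        for (double v : data) { mean += v; }
        mean /= data.length;
        double ref = 0;
        for (double v : data) { ref += (v - mean) * (v - mean); }

        System.out.println("streaming S = " + left.s + ", two-pass S = " + ref);
        System.out.println("var_pop = " + left.s / left.count
            + ", var_samp = " + left.s / (left.count - 1));
      }
    }

The two printed values of S should agree up to rounding, which gives a quick way to
validate any further change to the recurrences.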
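
Reviewer note (supplementary, not part of the patch): a likely reason to prefer the streaming
S = sum((x - mean)^2) bookkeeping of the generic evaluators over the sum-of-squares bookkeeping
in the old-style UDAFStd is numerical stability: E[x^2] - E[x]^2 cancels catastrophically when
the mean is large relative to the spread. A small illustrative sketch (the class name
StabilityNote is hypothetical):

    // StabilityNote.java - compares the two variance formulations that appear in this patch.
    public class StabilityNote {
      public static void main(String[] args) {
        double base = 1e9; // large offset, tiny spread: exact population variance is 2/3
        double[] data = {base + 1, base + 2, base + 3};

        double sum = 0, sumSq = 0;
        for (double v : data) { sum += v; sumSq += v * v; }
        long n = data.length;
        double naive = sumSq / n - (sum / n) * (sum / n); // E[x^2] - E[x]^2, cancels badly

        double mean = sum / n, s = 0;
        for (double v : data) { s += (v - mean) * (v - mean); }
        double twoPass = s / n; // stable deviation-based form

        System.out.println("naive = " + naive + ", two-pass = " + twoPass);
      }
    }

On doubles the naive form loses essentially all significant digits here, while the
deviation-based form returns the expected 0.666..., which matches the motivation for carrying
S through partial results instead of the raw sum of squares.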