Index: ql/src/test/results/clientpositive/udf_percentile.q.out
===================================================================
--- ql/src/test/results/clientpositive/udf_percentile.q.out (revision 0)
+++ ql/src/test/results/clientpositive/udf_percentile.q.out (revision 0)
@@ -0,0 +1,300 @@
+PREHOOK: query: DESCRIBE FUNCTION percentile
+PREHOOK: type: DESCFUNCTION
+POSTHOOK: query: DESCRIBE FUNCTION percentile
+POSTHOOK: type: DESCFUNCTION
+percentile(expr, pc) - Returns the percentile(s) of expr at pc (range: [0,1]). pc can be a double or double array
+PREHOOK: query: DESCRIBE FUNCTION EXTENDED percentile
+PREHOOK: type: DESCFUNCTION
+POSTHOOK: query: DESCRIBE FUNCTION EXTENDED percentile
+POSTHOOK: type: DESCFUNCTION
+percentile(expr, pc) - Returns the percentile(s) of expr at pc (range: [0,1]). pc can be a double or double array
+PREHOOK: query: DESCRIBE FUNCTION percentile
+PREHOOK: type: DESCFUNCTION
+POSTHOOK: query: DESCRIBE FUNCTION percentile
+POSTHOOK: type: DESCFUNCTION
+percentile(expr, pc) - Returns the percentile(s) of expr at pc (range: [0,1]). pc can be a double or double array
+PREHOOK: query: DESCRIBE FUNCTION EXTENDED percentile
+PREHOOK: type: DESCFUNCTION
+POSTHOOK: query: DESCRIBE FUNCTION EXTENDED percentile
+POSTHOOK: type: DESCFUNCTION
+percentile(expr, pc) - Returns the percentile(s) of expr at pc (range: [0,1]). pc can be a double or double array
+PREHOOK: query: SELECT CAST(key AS INT) DIV 10,
+       percentile(CAST(substr(value, 5) AS INT), 0.0),
+       percentile(CAST(substr(value, 5) AS INT), 0.5),
+       percentile(CAST(substr(value, 5) AS INT), 1.0),
+       percentile(CAST(substr(value, 5) AS INT), array(0.0, 0.5, 0.99, 1.0))
+FROM src
+GROUP BY CAST(key AS INT) DIV 10
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: file:/data/users/zshao/hadoop_hive_trunk/build/ql/scratchdir/hive_2010-02-26_17-36-31_164_6342123421987780799/10000
+POSTHOOK: query: SELECT CAST(key AS INT) DIV 10,
+       percentile(CAST(substr(value, 5) AS INT), 0.0),
+       percentile(CAST(substr(value, 5) AS INT), 0.5),
+       percentile(CAST(substr(value, 5) AS INT), 1.0),
+       percentile(CAST(substr(value, 5) AS INT), array(0.0, 0.5, 0.99, 1.0))
+FROM src
+GROUP BY CAST(key AS INT) DIV 10
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: file:/data/users/zshao/hadoop_hive_trunk/build/ql/scratchdir/hive_2010-02-26_17-36-31_164_6342123421987780799/10000
+0 0.0 4.5 9.0 [0.0,4.5,8.91,9.0]
+1 10.0 15.0 19.0 [10.0,15.0,18.91,19.0]
+2 20.0 26.0 28.0 [20.0,26.0,27.939999999999998,28.0]
+3 30.0 35.0 37.0 [30.0,35.0,37.0,37.0]
+4 41.0 42.5 47.0 [41.0,42.5,46.849999999999994,47.0]
+5 51.0 54.0 58.0 [51.0,54.0,58.0,58.0]
+6 64.0 66.5 69.0 [64.0,66.5,68.9,69.0]
+7 70.0 73.0 78.0 [70.0,73.0,77.91000000000001,78.0]
+8 80.0 84.0 87.0 [80.0,84.0,86.92,87.0]
+9 90.0 95.0 98.0 [90.0,95.0,98.0,98.0]
+10 100.0 103.0 105.0 [100.0,103.0,104.94,105.0]
+11 111.0 117.0 119.0 [111.0,117.0,119.0,119.0]
+12 120.0 127.0 129.0 [120.0,127.0,129.0,129.0]
+13 131.0 137.0 138.0 [131.0,137.0,138.0,138.0]
+14 143.0 146.0 149.0 [143.0,146.0,149.0,149.0]
+15 150.0 154.0 158.0 [150.0,154.0,157.92999999999998,158.0]
+16 160.0 166.5 169.0 [160.0,166.5,169.0,169.0]
+17 170.0 175.0 179.0 [170.0,175.0,179.0,179.0]
+18 180.0 186.5 189.0 [180.0,186.5,188.86,189.0]
+19 190.0 194.5 199.0 [190.0,194.5,199.0,199.0]
+20 200.0 205.0 209.0 [200.0,205.0,209.0,209.0]
+21 213.0 216.5 219.0 [213.0,216.5,219.0,219.0]
+22 221.0 224.0 229.0 [221.0,224.0,229.0,229.0]
+23 230.0 234.0 239.0 [230.0,234.0,239.0,239.0]
+24 241.0 244.0 249.0 [241.0,244.0,248.94,249.0]
+25 252.0 256.0 258.0 [252.0,256.0,257.94,258.0]
+26 260.0 264.0 266.0 [260.0,264.0,265.95,266.0]
+27 272.0 275.0 278.0 [272.0,275.0,278.0,278.0]
+28 280.0 283.5 289.0 [280.0,283.5,288.87,289.0]
+29 291.0 297.0 298.0 [291.0,297.0,298.0,298.0]
+30 302.0 307.0 309.0 [302.0,307.0,309.0,309.0]
+31 310.0 316.0 318.0 [310.0,316.0,318.0,318.0]
+32 321.0 324.0 327.0 [321.0,324.0,327.0,327.0]
+33 331.0 333.0 339.0 [331.0,333.0,338.92,339.0]
+34 341.0 345.0 348.0 [341.0,345.0,348.0,348.0]
+35 351.0 353.0 356.0 [351.0,353.0,355.91,356.0]
+36 360.0 367.0 369.0 [360.0,367.0,369.0,369.0]
+37 373.0 376.0 379.0 [373.0,376.0,378.95,379.0]
+38 382.0 384.0 389.0 [382.0,384.0,388.82,389.0]
+39 392.0 396.0 399.0 [392.0,396.0,399.0,399.0]
+40 400.0 403.5 409.0 [400.0,403.5,409.0,409.0]
+41 411.0 415.5 419.0 [411.0,415.5,418.91,419.0]
+42 421.0 425.5 429.0 [421.0,425.5,429.0,429.0]
+43 430.0 435.0 439.0 [430.0,435.0,439.0,439.0]
+44 443.0 446.0 449.0 [443.0,446.0,448.96,449.0]
+45 452.0 455.0 459.0 [452.0,455.0,459.0,459.0]
+46 460.0 467.5 469.0 [460.0,467.5,469.0,469.0]
+47 470.0 477.0 479.0 [470.0,477.0,478.94,479.0]
+48 480.0 484.0 489.0 [480.0,484.0,489.0,489.0]
+49 490.0 494.5 498.0 [490.0,494.5,498.0,498.0]
+PREHOOK: query: SELECT CAST(key AS INT) DIV 10,
+       percentile(CAST(substr(value, 5) AS INT), 0.0),
+       percentile(CAST(substr(value, 5) AS INT), 0.5),
+       percentile(CAST(substr(value, 5) AS INT), 1.0),
+       percentile(CAST(substr(value, 5) AS INT), array(0.0, 0.5, 0.99, 1.0))
+FROM src
+GROUP BY CAST(key AS INT) DIV 10
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: file:/data/users/zshao/hadoop_hive_trunk/build/ql/scratchdir/hive_2010-02-26_17-36-35_360_2553909716316899335/10000
+POSTHOOK: query: SELECT CAST(key AS INT) DIV 10,
+       percentile(CAST(substr(value, 5) AS INT), 0.0),
+       percentile(CAST(substr(value, 5) AS INT), 0.5),
+       percentile(CAST(substr(value, 5) AS INT), 1.0),
+       percentile(CAST(substr(value, 5) AS INT), array(0.0, 0.5, 0.99, 1.0))
+FROM src
+GROUP BY CAST(key AS INT) DIV 10
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: file:/data/users/zshao/hadoop_hive_trunk/build/ql/scratchdir/hive_2010-02-26_17-36-35_360_2553909716316899335/10000
+0 0.0 4.5 9.0 [0.0,4.5,8.91,9.0]
+1 10.0 15.0 19.0 [10.0,15.0,18.91,19.0]
+2 20.0 26.0 28.0 [20.0,26.0,27.939999999999998,28.0]
+3 30.0 35.0 37.0 [30.0,35.0,37.0,37.0]
+4 41.0 42.5 47.0 [41.0,42.5,46.849999999999994,47.0]
+5 51.0 54.0 58.0 [51.0,54.0,58.0,58.0]
+6 64.0 66.5 69.0 [64.0,66.5,68.9,69.0]
+7 70.0 73.0 78.0 [70.0,73.0,77.91000000000001,78.0]
+8 80.0 84.0 87.0 [80.0,84.0,86.92,87.0]
+9 90.0 95.0 98.0 [90.0,95.0,98.0,98.0]
+10 100.0 103.0 105.0 [100.0,103.0,104.94,105.0]
+11 111.0 117.0 119.0 [111.0,117.0,119.0,119.0]
+12 120.0 127.0 129.0 [120.0,127.0,129.0,129.0]
+13 131.0 137.0 138.0 [131.0,137.0,138.0,138.0]
+14 143.0 146.0 149.0 [143.0,146.0,149.0,149.0]
+15 150.0 154.0 158.0 [150.0,154.0,157.92999999999998,158.0]
+16 160.0 166.5 169.0 [160.0,166.5,169.0,169.0]
+17 170.0 175.0 179.0 [170.0,175.0,179.0,179.0]
+18 180.0 186.5 189.0 [180.0,186.5,188.86,189.0]
+19 190.0 194.5 199.0 [190.0,194.5,199.0,199.0]
+20 200.0 205.0 209.0 [200.0,205.0,209.0,209.0]
+21 213.0 216.5 219.0 [213.0,216.5,219.0,219.0]
+22 221.0 224.0 229.0 [221.0,224.0,229.0,229.0]
+23 230.0 234.0 239.0 [230.0,234.0,239.0,239.0]
+24 241.0 244.0 249.0 [241.0,244.0,248.94,249.0]
+25 252.0 256.0 258.0 [252.0,256.0,257.94,258.0]
+26 260.0 264.0 266.0 [260.0,264.0,265.95,266.0]
+27 272.0 275.0 278.0 [272.0,275.0,278.0,278.0]
+28 280.0 283.5 289.0 [280.0,283.5,288.87,289.0]
+29 291.0 297.0 298.0 [291.0,297.0,298.0,298.0]
+30 302.0 307.0 309.0 [302.0,307.0,309.0,309.0]
+31 310.0 316.0 318.0 [310.0,316.0,318.0,318.0]
+32 321.0 324.0 327.0 [321.0,324.0,327.0,327.0]
+33 331.0 333.0 339.0 [331.0,333.0,338.92,339.0]
+34 341.0 345.0 348.0 [341.0,345.0,348.0,348.0]
+35 351.0 353.0 356.0 [351.0,353.0,355.91,356.0]
+36 360.0 367.0 369.0 [360.0,367.0,369.0,369.0]
+37 373.0 376.0 379.0 [373.0,376.0,378.95,379.0]
+38 382.0 384.0 389.0 [382.0,384.0,388.82,389.0]
+39 392.0 396.0 399.0 [392.0,396.0,399.0,399.0]
+40 400.0 403.5 409.0 [400.0,403.5,409.0,409.0]
+41 411.0 415.5 419.0 [411.0,415.5,418.91,419.0]
+42 421.0 425.5 429.0 [421.0,425.5,429.0,429.0]
+43 430.0 435.0 439.0 [430.0,435.0,439.0,439.0]
+44 443.0 446.0 449.0 [443.0,446.0,448.96,449.0]
+45 452.0 455.0 459.0 [452.0,455.0,459.0,459.0]
+46 460.0 467.5 469.0 [460.0,467.5,469.0,469.0]
+47 470.0 477.0 479.0 [470.0,477.0,478.94,479.0]
+48 480.0 484.0 489.0 [480.0,484.0,489.0,489.0]
+49 490.0 494.5 498.0 [490.0,494.5,498.0,498.0]
+PREHOOK: query: SELECT CAST(key AS INT) DIV 10,
+       percentile(CAST(substr(value, 5) AS INT), 0.0),
+       percentile(CAST(substr(value, 5) AS INT), 0.5),
+       percentile(CAST(substr(value, 5) AS INT), 1.0),
+       percentile(CAST(substr(value, 5) AS INT), array(0.0, 0.5, 0.99, 1.0))
+FROM src
+GROUP BY CAST(key AS INT) DIV 10
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: file:/data/users/zshao/hadoop_hive_trunk/build/ql/scratchdir/hive_2010-02-26_17-36-39_470_6143785683094088204/10000
+POSTHOOK: query: SELECT CAST(key AS INT) DIV 10,
+       percentile(CAST(substr(value, 5) AS INT), 0.0),
+       percentile(CAST(substr(value, 5) AS INT), 0.5),
+       percentile(CAST(substr(value, 5) AS INT), 1.0),
+       percentile(CAST(substr(value, 5) AS INT), array(0.0, 0.5, 0.99, 1.0))
+FROM src
+GROUP BY CAST(key AS INT) DIV 10
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: file:/data/users/zshao/hadoop_hive_trunk/build/ql/scratchdir/hive_2010-02-26_17-36-39_470_6143785683094088204/10000
+0 0.0 4.5 9.0 [0.0,4.5,8.91,9.0]
+1 10.0 15.0 19.0 [10.0,15.0,18.91,19.0]
+2 20.0 26.0 28.0 [20.0,26.0,27.939999999999998,28.0]
+3 30.0 35.0 37.0 [30.0,35.0,37.0,37.0]
+4 41.0 42.5 47.0 [41.0,42.5,46.849999999999994,47.0]
+5 51.0 54.0 58.0 [51.0,54.0,58.0,58.0]
+6 64.0 66.5 69.0 [64.0,66.5,68.9,69.0]
+7 70.0 73.0 78.0 [70.0,73.0,77.91000000000001,78.0]
+8 80.0 84.0 87.0 [80.0,84.0,86.92,87.0]
+9 90.0 95.0 98.0 [90.0,95.0,98.0,98.0]
+10 100.0 103.0 105.0 [100.0,103.0,104.94,105.0]
+11 111.0 117.0 119.0 [111.0,117.0,119.0,119.0]
+12 120.0 127.0 129.0 [120.0,127.0,129.0,129.0]
+13 131.0 137.0 138.0 [131.0,137.0,138.0,138.0]
+14 143.0 146.0 149.0 [143.0,146.0,149.0,149.0]
+15 150.0 154.0 158.0 [150.0,154.0,157.92999999999998,158.0]
+16 160.0 166.5 169.0 [160.0,166.5,169.0,169.0]
+17 170.0 175.0 179.0 [170.0,175.0,179.0,179.0]
+18 180.0 186.5 189.0 [180.0,186.5,188.86,189.0]
+19 190.0 194.5 199.0 [190.0,194.5,199.0,199.0]
+20 200.0 205.0 209.0 [200.0,205.0,209.0,209.0]
+21 213.0 216.5 219.0 [213.0,216.5,219.0,219.0]
+22 221.0 224.0 229.0 [221.0,224.0,229.0,229.0]
+23 230.0 234.0 239.0 [230.0,234.0,239.0,239.0]
+24 241.0 244.0 249.0 [241.0,244.0,248.94,249.0]
+25 252.0 256.0 258.0 [252.0,256.0,257.94,258.0]
+26 260.0 264.0 266.0 [260.0,264.0,265.95,266.0]
+27 272.0 275.0 278.0 [272.0,275.0,278.0,278.0]
+28 280.0 283.5 289.0 [280.0,283.5,288.87,289.0]
+29 291.0 297.0 298.0 [291.0,297.0,298.0,298.0]
+30 302.0 307.0 309.0 [302.0,307.0,309.0,309.0]
+31 310.0 316.0 318.0 [310.0,316.0,318.0,318.0]
+32 321.0 324.0 327.0 [321.0,324.0,327.0,327.0]
+33 331.0 333.0 339.0 [331.0,333.0,338.92,339.0]
+34 341.0 345.0 348.0 [341.0,345.0,348.0,348.0]
+35 351.0 353.0 356.0 [351.0,353.0,355.91,356.0]
+36 360.0 367.0 369.0 [360.0,367.0,369.0,369.0]
+37 373.0 376.0 379.0 [373.0,376.0,378.95,379.0]
+38 382.0 384.0 389.0 [382.0,384.0,388.82,389.0]
+39 392.0 396.0 399.0 [392.0,396.0,399.0,399.0]
+40 400.0 403.5 409.0 [400.0,403.5,409.0,409.0]
+41 411.0 415.5 419.0 [411.0,415.5,418.91,419.0]
+42 421.0 425.5 429.0 [421.0,425.5,429.0,429.0]
+43 430.0 435.0 439.0 [430.0,435.0,439.0,439.0]
+44 443.0 446.0 449.0 [443.0,446.0,448.96,449.0]
+45 452.0 455.0 459.0 [452.0,455.0,459.0,459.0]
+46 460.0 467.5 469.0 [460.0,467.5,469.0,469.0]
+47 470.0 477.0 479.0 [470.0,477.0,478.94,479.0]
+48 480.0 484.0 489.0 [480.0,484.0,489.0,489.0]
+49 490.0 494.5 498.0 [490.0,494.5,498.0,498.0]
+PREHOOK: query: SELECT CAST(key AS INT) DIV 10,
+       percentile(CAST(substr(value, 5) AS INT), 0.0),
+       percentile(CAST(substr(value, 5) AS INT), 0.5),
+       percentile(CAST(substr(value, 5) AS INT), 1.0),
+       percentile(CAST(substr(value, 5) AS INT), array(0.0, 0.5, 0.99, 1.0))
+FROM src
+GROUP BY CAST(key AS INT) DIV 10
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: file:/data/users/zshao/hadoop_hive_trunk/build/ql/scratchdir/hive_2010-02-26_17-36-46_553_5031670487614509596/10000
+POSTHOOK: query: SELECT CAST(key AS INT) DIV 10,
+       percentile(CAST(substr(value, 5) AS INT), 0.0),
+       percentile(CAST(substr(value, 5) AS INT), 0.5),
+       percentile(CAST(substr(value, 5) AS INT), 1.0),
+       percentile(CAST(substr(value, 5) AS INT), array(0.0, 0.5, 0.99, 1.0))
+FROM src
+GROUP BY CAST(key AS INT) DIV 10
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: file:/data/users/zshao/hadoop_hive_trunk/build/ql/scratchdir/hive_2010-02-26_17-36-46_553_5031670487614509596/10000
+0 0.0 4.5 9.0 [0.0,4.5,8.91,9.0]
+1 10.0 15.0 19.0 [10.0,15.0,18.91,19.0]
+2 20.0 26.0 28.0 [20.0,26.0,27.939999999999998,28.0]
+3 30.0 35.0 37.0 [30.0,35.0,37.0,37.0]
+4 41.0 42.5 47.0 [41.0,42.5,46.849999999999994,47.0]
+5 51.0 54.0 58.0 [51.0,54.0,58.0,58.0]
+6 64.0 66.5 69.0 [64.0,66.5,68.9,69.0]
+7 70.0 73.0 78.0 [70.0,73.0,77.91000000000001,78.0]
+8 80.0 84.0 87.0 [80.0,84.0,86.92,87.0]
+9 90.0 95.0 98.0 [90.0,95.0,98.0,98.0]
+10 100.0 103.0 105.0 [100.0,103.0,104.94,105.0]
+11 111.0 117.0 119.0 [111.0,117.0,119.0,119.0]
+12 120.0 127.0 129.0 [120.0,127.0,129.0,129.0]
+13 131.0 137.0 138.0 [131.0,137.0,138.0,138.0]
+14 143.0 146.0 149.0 [143.0,146.0,149.0,149.0]
+15 150.0 154.0 158.0 [150.0,154.0,157.92999999999998,158.0]
+16 160.0 166.5 169.0 [160.0,166.5,169.0,169.0]
+17 170.0 175.0 179.0 [170.0,175.0,179.0,179.0]
+18 180.0 186.5 189.0 [180.0,186.5,188.86,189.0]
+19 190.0 194.5 199.0 [190.0,194.5,199.0,199.0]
+20 200.0 205.0 209.0 [200.0,205.0,209.0,209.0]
+21 213.0 216.5 219.0 [213.0,216.5,219.0,219.0]
+22 221.0 224.0 229.0 [221.0,224.0,229.0,229.0]
+23 230.0 234.0 239.0 [230.0,234.0,239.0,239.0]
+24 241.0 244.0 249.0 [241.0,244.0,248.94,249.0]
+25 252.0 256.0 258.0 [252.0,256.0,257.94,258.0]
+26 260.0 264.0 266.0 [260.0,264.0,265.95,266.0]
+27 272.0 275.0 278.0 [272.0,275.0,278.0,278.0]
+28 280.0 283.5 289.0 [280.0,283.5,288.87,289.0]
+29 291.0 297.0 298.0 [291.0,297.0,298.0,298.0]
+30 302.0 307.0 309.0 [302.0,307.0,309.0,309.0]
+31 310.0 316.0 318.0 [310.0,316.0,318.0,318.0]
+32 321.0 324.0 327.0 [321.0,324.0,327.0,327.0]
+33 331.0 333.0 339.0 [331.0,333.0,338.92,339.0]
+34 341.0 345.0 348.0 [341.0,345.0,348.0,348.0]
+35 351.0 353.0 356.0 [351.0,353.0,355.91,356.0]
+36 360.0 367.0 369.0 [360.0,367.0,369.0,369.0]
+37 373.0 376.0 379.0 [373.0,376.0,378.95,379.0]
+38 382.0 384.0 389.0 [382.0,384.0,388.82,389.0]
+39 392.0 396.0 399.0 [392.0,396.0,399.0,399.0]
+40 400.0 403.5 409.0 [400.0,403.5,409.0,409.0]
+41 411.0 415.5 419.0 [411.0,415.5,418.91,419.0]
+42 421.0 425.5 429.0 [421.0,425.5,429.0,429.0]
+43 430.0 435.0 439.0 [430.0,435.0,439.0,439.0]
+44 443.0 446.0 449.0 [443.0,446.0,448.96,449.0]
+45 452.0 455.0 459.0 [452.0,455.0,459.0,459.0]
+46 460.0 467.5 469.0 [460.0,467.5,469.0,469.0]
+47 470.0 477.0 479.0 [470.0,477.0,478.94,479.0]
+48 480.0 484.0 489.0 [480.0,484.0,489.0,489.0]
+49 490.0 494.5 498.0 [490.0,494.5,498.0,498.0]
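The four result blocks above are identical by design: they run the same query under the four combinations of hive.map.aggr and hive.groupby.skewindata (see the .q file below). The first row, [0.0,4.5,8.91,9.0], can be checked by hand, assuming the standard src test fixture whose ten values with key in [0,9] form the multiset {0, 0, 0, 2, 4, 5, 5, 5, 8, 9} (an assumption about the fixture data, not stated in this patch). With n = 10 sorted values, the implementation below targets position (n - 1) * pc and linearly interpolates:

    pc = 0.0   ->  position 0.0   ->  sorted[0] = 0.0
    pc = 0.5   ->  position 4.5   ->  0.5 * sorted[4] + 0.5 * sorted[5] = 0.5 * 4 + 0.5 * 5 = 4.5
    pc = 0.99  ->  position 8.91  ->  0.09 * sorted[8] + 0.91 * sorted[9] = 0.09 * 8 + 0.91 * 9 = 8.91
    pc = 1.0   ->  position 9.0   ->  sorted[9] = 9.0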
Index: ql/src/test/queries/clientpositive/udf_percentile.q
===================================================================
--- ql/src/test/queries/clientpositive/udf_percentile.q (revision 0)
+++ ql/src/test/queries/clientpositive/udf_percentile.q (revision 0)
@@ -0,0 +1,54 @@
+DESCRIBE FUNCTION percentile;
+DESCRIBE FUNCTION EXTENDED percentile;
+
+DESCRIBE FUNCTION percentile;
+DESCRIBE FUNCTION EXTENDED percentile;
+
+
+set hive.map.aggr = false;
+set hive.groupby.skewindata = false;
+
+SELECT CAST(key AS INT) DIV 10,
+       percentile(CAST(substr(value, 5) AS INT), 0.0),
+       percentile(CAST(substr(value, 5) AS INT), 0.5),
+       percentile(CAST(substr(value, 5) AS INT), 1.0),
+       percentile(CAST(substr(value, 5) AS INT), array(0.0, 0.5, 0.99, 1.0))
+FROM src
+GROUP BY CAST(key AS INT) DIV 10;
+
+
+set hive.map.aggr = true;
+set hive.groupby.skewindata = false;
+
+SELECT CAST(key AS INT) DIV 10,
+       percentile(CAST(substr(value, 5) AS INT), 0.0),
+       percentile(CAST(substr(value, 5) AS INT), 0.5),
+       percentile(CAST(substr(value, 5) AS INT), 1.0),
+       percentile(CAST(substr(value, 5) AS INT), array(0.0, 0.5, 0.99, 1.0))
+FROM src
+GROUP BY CAST(key AS INT) DIV 10;
+
+
+
+set hive.map.aggr = false;
+set hive.groupby.skewindata = true;
+
+SELECT CAST(key AS INT) DIV 10,
+       percentile(CAST(substr(value, 5) AS INT), 0.0),
+       percentile(CAST(substr(value, 5) AS INT), 0.5),
+       percentile(CAST(substr(value, 5) AS INT), 1.0),
+       percentile(CAST(substr(value, 5) AS INT), array(0.0, 0.5, 0.99, 1.0))
+FROM src
+GROUP BY CAST(key AS INT) DIV 10;
+
+
+set hive.map.aggr = true;
+set hive.groupby.skewindata = true;
+
+SELECT CAST(key AS INT) DIV 10,
+       percentile(CAST(substr(value, 5) AS INT), 0.0),
+       percentile(CAST(substr(value, 5) AS INT), 0.5),
+       percentile(CAST(substr(value, 5) AS INT), 1.0),
+       percentile(CAST(substr(value, 5) AS INT), array(0.0, 0.5, 0.99, 1.0))
+FROM src
+GROUP BY CAST(key AS INT) DIV 10;
Index: ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java (revision 916812)
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java (working copy)
@@ -37,6 +37,7 @@
 import org.apache.hadoop.hive.ql.parse.SemanticException;
 import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
 import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
+import org.apache.hadoop.hive.ql.udf.UDAFPercentile;
 import org.apache.hadoop.hive.ql.udf.UDFAbs;
 import org.apache.hadoop.hive.ql.udf.UDFAcos;
 import org.apache.hadoop.hive.ql.udf.UDFAscii;
@@ -167,7 +168,6 @@
 
 /**
  * FunctionRegistry.
- *
  */
 public final class FunctionRegistry {
 
@@ -322,6 +322,8 @@
     registerGenericUDAF("var_pop", new GenericUDAFVariance());
     registerGenericUDAF("var_samp", new GenericUDAFVarianceSample());
 
+    registerUDAF("percentile", UDAFPercentile.class);
+
     // Generic UDFs
     registerGenericUDF("array", GenericUDFArray.class);
     registerGenericUDF("map", GenericUDFMap.class);
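Note that percentile is registered through registerUDAF rather than registerGenericUDAF: it is a classic UDAF, so Hive's UDAF bridge picks an evaluator by matching the call's argument types against the iterate() signatures of the evaluator classes defined below. Roughly (an illustration of the resolution, not code from this patch):

    percentile(col, 0.5)              ->  PercentileLongEvaluator.iterate(LongWritable, double)
    percentile(col, array(0.0, 0.5))  ->  PercentileLongArrayEvaluator.iterate(LongWritable, List<DoubleWritable>)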
Index: ql/src/java/org/apache/hadoop/hive/ql/udf/UDAFPercentile.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/udf/UDAFPercentile.java (revision 0)
+++ ql/src/java/org/apache/hadoop/hive/ql/udf/UDAFPercentile.java (revision 0)
@@ -0,0 +1,279 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.udf;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDAF;
+import org.apache.hadoop.hive.ql.exec.UDAFEvaluator;
+import org.apache.hadoop.hive.serde2.io.DoubleWritable;
+import org.apache.hadoop.io.LongWritable;
+
+/**
+ * UDAF for calculating the percentile values.
+ */
+@Description(name = "percentile",
+    value = "_FUNC_(expr, pc) - Returns the percentile(s) of expr at pc (range: [0,1]). "
+    + "pc can be a double or double array")
+public class UDAFPercentile extends UDAF {
+
+  /**
+   * A state class to store intermediate aggregation results.
+   */
+  public static class State {
+    private Map<LongWritable, LongWritable> counts;
+    private List<DoubleWritable> percentiles;
+  }
+
+  /**
+   * A comparator to sort the entries in order.
+   */
+  public static class MyComparator implements
+      Comparator<Map.Entry<LongWritable, LongWritable>> {
+    @Override
+    public int compare(Map.Entry<LongWritable, LongWritable> o1,
+        Map.Entry<LongWritable, LongWritable> o2) {
+      return o1.getKey().compareTo(o2.getKey());
+    }
+  }
+
+  /**
+   * Increment the State object with o as the key, and i as the count.
+   */
+  private static void increment(State s, LongWritable o, long i) {
+    if (s.counts == null) {
+      s.counts = new HashMap<LongWritable, LongWritable>();
+    }
+    LongWritable count = s.counts.get(o);
+    if (count == null) {
+      // We have to create a new object, because the object o belongs
+      // to the code that creates it and may get its value changed.
+      LongWritable key = new LongWritable();
+      key.set(o.get());
+      s.counts.put(key, new LongWritable(i));
+    } else {
+      count.set(count.get() + i);
+    }
+  }
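+
+  /*
+   * Worked example of the search and interpolation in getPercentile() below:
+   * for the values {1, 2, 3, 4} and pc = 0.5, position = (4 - 1) * 0.5 = 1.5,
+   * so lower = 1 and higher = 2. The scan stops at the first entry whose
+   * accumulated count reaches lower + 1 = 2, giving lowerKey = 2; the next
+   * entry gives higherKey = 3. The result is
+   * (2 - 1.5) * 2 + (1.5 - 1) * 3 = 2.5.
+   */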
+
+  /**
+   * Get the percentile value.
+   */
+  private static double getPercentile(
+      List<Map.Entry<LongWritable, LongWritable>> entriesList, double position) {
+    // We may need to do linear interpolation to get the exact percentile
+    long lower = (long) Math.floor(position);
+    long higher = (long) Math.ceil(position);
+
+    // Linear search since this won't take much time from the total execution anyway
+    // lower has the range of [0 .. total-1]
+    // The first entry with accumulated count (lower+1) corresponds to the lower position.
+    int i = 0;
+    while (entriesList.get(i).getValue().get() < lower + 1) {
+      i++;
+    }
+
+    long lowerKey = entriesList.get(i).getKey().get();
+    if (higher == lower) {
+      // no interpolation needed because position does not have a fraction
+      return lowerKey;
+    }
+
+    if (entriesList.get(i).getValue().get() < higher + 1) {
+      i++;
+    }
+    long higherKey = entriesList.get(i).getKey().get();
+
+    if (higherKey == lowerKey) {
+      // no interpolation needed because lower position and higher position have the same key
+      return lowerKey;
+    }
+
+    // Linear interpolation to get the exact percentile
+    return (higher - position) * lowerKey + (position - lower) * higherKey;
+  }
+
+  /**
+   * The evaluator for percentile computation based on long.
+   */
+  public static class PercentileLongEvaluator implements UDAFEvaluator {
+
+    private State state;
+
+    public PercentileLongEvaluator() {
+      state = new State();
+    }
+
+    public void init() {
+      if (state.counts != null) {
+        state.counts.clear();
+      }
+    }
+
+    public boolean iterate(LongWritable o, double percentile) {
+      if (state.percentiles == null) {
+        state.percentiles = new ArrayList<DoubleWritable>(1);
+        state.percentiles.add(new DoubleWritable(percentile));
+      }
+      if (o != null) {
+        increment(state, o, 1);
+      }
+      return true;
+    }
+
+    public State terminatePartial() {
+      return state;
+    }
+
+    public boolean merge(State other) {
+      if (state.percentiles == null) {
+        state.percentiles = new ArrayList<DoubleWritable>(other.percentiles);
+      }
+      for (Map.Entry<LongWritable, LongWritable> e : other.counts.entrySet()) {
+        increment(state, e.getKey(), e.getValue().get());
+      }
+      return true;
+    }
+
+    private DoubleWritable result;
+
+    public DoubleWritable terminate() {
+      // No input data.
+      if (state.counts == null) {
+        return null;
+      }
+
+      // Get all items into an array and sort them.
+      Set<Map.Entry<LongWritable, LongWritable>> entries = state.counts.entrySet();
+      List<Map.Entry<LongWritable, LongWritable>> entriesList =
+          new ArrayList<Map.Entry<LongWritable, LongWritable>>(entries);
+      Collections.sort(entriesList, new MyComparator());
+
+      // Accumulate the counts.
+      long total = 0;
+      for (int i = 0; i < entriesList.size(); i++) {
+        LongWritable count = entriesList.get(i).getValue();
+        total += count.get();
+        count.set(total);
+      }
+
+      // Initialize the result.
+      if (result == null) {
+        result = new DoubleWritable();
+      }
+
+      // maxPosition is the 1.0 percentile
+      long maxPosition = total - 1;
+      double position = maxPosition * state.percentiles.get(0).get();
+      result.set(getPercentile(entriesList, position));
+      return result;
+    }
+  }
+
+  /**
+   * The evaluator for percentile computation based on long for an array of percentiles.
+   */
+  public static class PercentileLongArrayEvaluator implements UDAFEvaluator {
+
+    private State state;
+
+    public PercentileLongArrayEvaluator() {
+      state = new State();
+    }
+
+    public void init() {
+      if (state.counts != null) {
+        state.counts.clear();
+      }
+    }
+
+    public boolean iterate(LongWritable o, List<DoubleWritable> percentiles) {
+      if (state.percentiles == null) {
+        state.percentiles = new ArrayList<DoubleWritable>(percentiles);
+      }
+      if (o != null) {
+        increment(state, o, 1);
+      }
+      return true;
+    }
+
+    public State terminatePartial() {
+      return state;
+    }
+
+    public boolean merge(State other) {
+      if (state.percentiles == null) {
+        state.percentiles = new ArrayList<DoubleWritable>(other.percentiles);
+      }
+      for (Map.Entry<LongWritable, LongWritable> e : other.counts.entrySet()) {
+        increment(state, e.getKey(), e.getValue().get());
+      }
+      return true;
+    }
+
+    private List<DoubleWritable> results;
+
+    public List<DoubleWritable> terminate() {
+      // No input data
+      if (state.counts == null) {
+        return null;
+      }
+
+      // Get all items into an array and sort them
+      Set<Map.Entry<LongWritable, LongWritable>> entries = state.counts.entrySet();
+      List<Map.Entry<LongWritable, LongWritable>> entriesList =
+          new ArrayList<Map.Entry<LongWritable, LongWritable>>(entries);
+      Collections.sort(entriesList, new MyComparator());
+
+      // accumulate the counts
+      long total = 0;
+      for (int i = 0; i < entriesList.size(); i++) {
+        LongWritable count = entriesList.get(i).getValue();
+        total += count.get();
+        count.set(total);
+      }
+
+      // maxPosition is the 1.0 percentile
+      long maxPosition = total - 1;
+
+      // Initialize the results
+      if (results == null) {
+        results = new ArrayList<DoubleWritable>();
+        for (int i = 0; i < state.percentiles.size(); i++) {
+          results.add(new DoubleWritable());
+        }
+      }
+      // Set the results
+      for (int i = 0; i < state.percentiles.size(); i++) {
+        double position = maxPosition * state.percentiles.get(i).get();
+        results.get(i).set(getPercentile(entriesList, position));
+      }
+      return results;
+    }
+  }
+
+}
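For reference, a minimal sketch of the evaluator lifecycle (illustrative only: it drives PercentileLongEvaluator directly instead of going through Hive's group-by machinery, and the class name PercentileDemo is invented for the example):

import org.apache.hadoop.hive.ql.udf.UDAFPercentile.PercentileLongEvaluator;
import org.apache.hadoop.io.LongWritable;

public class PercentileDemo {
  public static void main(String[] args) {
    PercentileLongEvaluator eval = new PercentileLongEvaluator();
    eval.init();
    // Hive calls iterate() once per row; the percentile argument is constant.
    for (long v : new long[] {1, 2, 3, 4}) {
      eval.iterate(new LongWritable(v), 0.5);
    }
    // position = (4 - 1) * 0.5 = 1.5, interpolated between 2 and 3.
    System.out.println(eval.terminate()); // 2.5, assuming DoubleWritable.toString() prints the raw double
  }
}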