Index: ql/src/test/results/clientpositive/udaf_percentile_approx.q.out =================================================================== --- ql/src/test/results/clientpositive/udaf_percentile_approx.q.out (revision 979434) +++ ql/src/test/results/clientpositive/udaf_percentile_approx.q.out (working copy) @@ -1,108 +1,99 @@ -PREHOOK: query: SELECT percentile_approx(cast(substr(src.value,5) AS double), 0.5) FROM src -PREHOOK: type: QUERY -PREHOOK: Input: default@src -PREHOOK: Output: file:/var/folders/7i/7iCDbWRkGHOcgJgX0zscimPXXts/-Tmp-/mlahiri/hive_2010-06-23_12-43-37_073_6076210560386322054/10000 -POSTHOOK: query: SELECT percentile_approx(cast(substr(src.value,5) AS double), 0.5) FROM src -POSTHOOK: type: QUERY -POSTHOOK: Input: default@src -POSTHOOK: Output: file:/var/folders/7i/7iCDbWRkGHOcgJgX0zscimPXXts/-Tmp-/mlahiri/hive_2010-06-23_12-43-37_073_6076210560386322054/10000 -255.5 PREHOOK: query: SELECT percentile_approx(cast(substr(src.value,5) AS double), 0.5, 100) FROM src PREHOOK: type: QUERY PREHOOK: Input: default@src -PREHOOK: Output: file:/var/folders/7i/7iCDbWRkGHOcgJgX0zscimPXXts/-Tmp-/mlahiri/hive_2010-06-23_12-43-41_860_8275933713801449697/10000 +PREHOOK: Output: file:/var/folders/7i/7iCDbWRkGHOcgJgX0zscimPXXts/-Tmp-/mlahiri/hive_2010-07-26_14-08-04_416_834356939093099091/10000 POSTHOOK: query: SELECT percentile_approx(cast(substr(src.value,5) AS double), 0.5, 100) FROM src POSTHOOK: type: QUERY POSTHOOK: Input: default@src -POSTHOOK: Output: file:/var/folders/7i/7iCDbWRkGHOcgJgX0zscimPXXts/-Tmp-/mlahiri/hive_2010-06-23_12-43-41_860_8275933713801449697/10000 +POSTHOOK: Output: file:/var/folders/7i/7iCDbWRkGHOcgJgX0zscimPXXts/-Tmp-/mlahiri/hive_2010-07-26_14-08-04_416_834356939093099091/10000 252.77777777777777 PREHOOK: query: SELECT percentile_approx(cast(substr(src.value,5) AS double), 0.5, 1000) FROM src PREHOOK: type: QUERY PREHOOK: Input: default@src -PREHOOK: Output: file:/var/folders/7i/7iCDbWRkGHOcgJgX0zscimPXXts/-Tmp-/mlahiri/hive_2010-06-23_12-43-46_334_9017578099479193081/10000 +PREHOOK: Output: file:/var/folders/7i/7iCDbWRkGHOcgJgX0zscimPXXts/-Tmp-/mlahiri/hive_2010-07-26_14-08-09_347_2669488651086861546/10000 POSTHOOK: query: SELECT percentile_approx(cast(substr(src.value,5) AS double), 0.5, 1000) FROM src POSTHOOK: type: QUERY POSTHOOK: Input: default@src -POSTHOOK: Output: file:/var/folders/7i/7iCDbWRkGHOcgJgX0zscimPXXts/-Tmp-/mlahiri/hive_2010-06-23_12-43-46_334_9017578099479193081/10000 +POSTHOOK: Output: file:/var/folders/7i/7iCDbWRkGHOcgJgX0zscimPXXts/-Tmp-/mlahiri/hive_2010-07-26_14-08-09_347_2669488651086861546/10000 255.5 PREHOOK: query: SELECT percentile_approx(cast(substr(src.value,5) AS int), 0.5) FROM src PREHOOK: type: QUERY PREHOOK: Input: default@src -PREHOOK: Output: file:/var/folders/7i/7iCDbWRkGHOcgJgX0zscimPXXts/-Tmp-/mlahiri/hive_2010-06-23_12-43-51_723_1654864352248105322/10000 +PREHOOK: Output: file:/var/folders/7i/7iCDbWRkGHOcgJgX0zscimPXXts/-Tmp-/mlahiri/hive_2010-07-26_14-08-14_077_78739060255077214/10000 POSTHOOK: query: SELECT percentile_approx(cast(substr(src.value,5) AS int), 0.5) FROM src POSTHOOK: type: QUERY POSTHOOK: Input: default@src -POSTHOOK: Output: file:/var/folders/7i/7iCDbWRkGHOcgJgX0zscimPXXts/-Tmp-/mlahiri/hive_2010-06-23_12-43-51_723_1654864352248105322/10000 +POSTHOOK: Output: file:/var/folders/7i/7iCDbWRkGHOcgJgX0zscimPXXts/-Tmp-/mlahiri/hive_2010-07-26_14-08-14_077_78739060255077214/10000 255.5 PREHOOK: query: SELECT percentile_approx(cast(substr(src.value,5) AS int), 0.5, 100) FROM src PREHOOK: type: QUERY PREHOOK: Input: default@src -PREHOOK: Output: file:/var/folders/7i/7iCDbWRkGHOcgJgX0zscimPXXts/-Tmp-/mlahiri/hive_2010-06-23_12-43-56_385_7799592520279056727/10000 +PREHOOK: Output: file:/var/folders/7i/7iCDbWRkGHOcgJgX0zscimPXXts/-Tmp-/mlahiri/hive_2010-07-26_14-08-18_671_141245215173795995/10000 POSTHOOK: query: SELECT percentile_approx(cast(substr(src.value,5) AS int), 0.5, 100) FROM src POSTHOOK: type: QUERY POSTHOOK: Input: default@src -POSTHOOK: Output: file:/var/folders/7i/7iCDbWRkGHOcgJgX0zscimPXXts/-Tmp-/mlahiri/hive_2010-06-23_12-43-56_385_7799592520279056727/10000 +POSTHOOK: Output: file:/var/folders/7i/7iCDbWRkGHOcgJgX0zscimPXXts/-Tmp-/mlahiri/hive_2010-07-26_14-08-18_671_141245215173795995/10000 252.77777777777777 PREHOOK: query: SELECT percentile_approx(cast(substr(src.value,5) AS int), 0.5, 1000) FROM src PREHOOK: type: QUERY PREHOOK: Input: default@src -PREHOOK: Output: file:/var/folders/7i/7iCDbWRkGHOcgJgX0zscimPXXts/-Tmp-/mlahiri/hive_2010-06-23_12-44-00_946_3923666334878603072/10000 +PREHOOK: Output: file:/var/folders/7i/7iCDbWRkGHOcgJgX0zscimPXXts/-Tmp-/mlahiri/hive_2010-07-26_14-08-23_266_8933622767986547541/10000 POSTHOOK: query: SELECT percentile_approx(cast(substr(src.value,5) AS int), 0.5, 1000) FROM src POSTHOOK: type: QUERY POSTHOOK: Input: default@src -POSTHOOK: Output: file:/var/folders/7i/7iCDbWRkGHOcgJgX0zscimPXXts/-Tmp-/mlahiri/hive_2010-06-23_12-44-00_946_3923666334878603072/10000 +POSTHOOK: Output: file:/var/folders/7i/7iCDbWRkGHOcgJgX0zscimPXXts/-Tmp-/mlahiri/hive_2010-07-26_14-08-23_266_8933622767986547541/10000 255.5 PREHOOK: query: SELECT percentile_approx(cast(substr(src.value,5) AS double), array(0.05,0.5,0.95,0.98)) FROM src PREHOOK: type: QUERY PREHOOK: Input: default@src -PREHOOK: Output: file:/var/folders/7i/7iCDbWRkGHOcgJgX0zscimPXXts/-Tmp-/mlahiri/hive_2010-06-23_12-44-05_574_6679873308435909842/10000 +PREHOOK: Output: file:/var/folders/7i/7iCDbWRkGHOcgJgX0zscimPXXts/-Tmp-/mlahiri/hive_2010-07-26_14-08-27_803_3529609176449051766/10000 POSTHOOK: query: SELECT percentile_approx(cast(substr(src.value,5) AS double), array(0.05,0.5,0.95,0.98)) FROM src POSTHOOK: type: QUERY POSTHOOK: Input: default@src -POSTHOOK: Output: file:/var/folders/7i/7iCDbWRkGHOcgJgX0zscimPXXts/-Tmp-/mlahiri/hive_2010-06-23_12-44-05_574_6679873308435909842/10000 +POSTHOOK: Output: file:/var/folders/7i/7iCDbWRkGHOcgJgX0zscimPXXts/-Tmp-/mlahiri/hive_2010-07-26_14-08-27_803_3529609176449051766/10000 [26.0,255.5,479.0,491.0] PREHOOK: query: SELECT percentile_approx(cast(substr(src.value,5) AS double), array(0.05,0.5,0.95,0.98), 100) FROM src PREHOOK: type: QUERY PREHOOK: Input: default@src -PREHOOK: Output: file:/var/folders/7i/7iCDbWRkGHOcgJgX0zscimPXXts/-Tmp-/mlahiri/hive_2010-06-23_12-44-10_350_1432232531700471217/10000 +PREHOOK: Output: file:/var/folders/7i/7iCDbWRkGHOcgJgX0zscimPXXts/-Tmp-/mlahiri/hive_2010-07-26_14-08-32_749_7874714977417965677/10000 POSTHOOK: query: SELECT percentile_approx(cast(substr(src.value,5) AS double), array(0.05,0.5,0.95,0.98), 100) FROM src POSTHOOK: type: QUERY POSTHOOK: Input: default@src -POSTHOOK: Output: file:/var/folders/7i/7iCDbWRkGHOcgJgX0zscimPXXts/-Tmp-/mlahiri/hive_2010-06-23_12-44-10_350_1432232531700471217/10000 +POSTHOOK: Output: file:/var/folders/7i/7iCDbWRkGHOcgJgX0zscimPXXts/-Tmp-/mlahiri/hive_2010-07-26_14-08-32_749_7874714977417965677/10000 [24.07,252.77777777777777,476.9444444444444,487.82] PREHOOK: query: SELECT percentile_approx(cast(substr(src.value,5) AS double), array(0.05,0.5,0.95,0.98), 1000) FROM src PREHOOK: type: QUERY PREHOOK: Input: default@src -PREHOOK: Output: file:/var/folders/7i/7iCDbWRkGHOcgJgX0zscimPXXts/-Tmp-/mlahiri/hive_2010-06-23_12-44-15_355_3332753751548479422/10000 +PREHOOK: Output: file:/var/folders/7i/7iCDbWRkGHOcgJgX0zscimPXXts/-Tmp-/mlahiri/hive_2010-07-26_14-08-37_425_1289670844459308120/10000 POSTHOOK: query: SELECT percentile_approx(cast(substr(src.value,5) AS double), array(0.05,0.5,0.95,0.98), 1000) FROM src POSTHOOK: type: QUERY POSTHOOK: Input: default@src -POSTHOOK: Output: file:/var/folders/7i/7iCDbWRkGHOcgJgX0zscimPXXts/-Tmp-/mlahiri/hive_2010-06-23_12-44-15_355_3332753751548479422/10000 +POSTHOOK: Output: file:/var/folders/7i/7iCDbWRkGHOcgJgX0zscimPXXts/-Tmp-/mlahiri/hive_2010-07-26_14-08-37_425_1289670844459308120/10000 [26.0,255.5,479.0,491.0] PREHOOK: query: SELECT percentile_approx(cast(substr(src.value,5) AS int), array(0.05,0.5,0.95,0.98)) FROM src PREHOOK: type: QUERY PREHOOK: Input: default@src -PREHOOK: Output: file:/var/folders/7i/7iCDbWRkGHOcgJgX0zscimPXXts/-Tmp-/mlahiri/hive_2010-06-23_12-44-19_897_625588204506780432/10000 +PREHOOK: Output: file:/var/folders/7i/7iCDbWRkGHOcgJgX0zscimPXXts/-Tmp-/mlahiri/hive_2010-07-26_14-08-42_578_6317460559415319067/10000 POSTHOOK: query: SELECT percentile_approx(cast(substr(src.value,5) AS int), array(0.05,0.5,0.95,0.98)) FROM src POSTHOOK: type: QUERY POSTHOOK: Input: default@src -POSTHOOK: Output: file:/var/folders/7i/7iCDbWRkGHOcgJgX0zscimPXXts/-Tmp-/mlahiri/hive_2010-06-23_12-44-19_897_625588204506780432/10000 +POSTHOOK: Output: file:/var/folders/7i/7iCDbWRkGHOcgJgX0zscimPXXts/-Tmp-/mlahiri/hive_2010-07-26_14-08-42_578_6317460559415319067/10000 [26.0,255.5,479.0,491.0] PREHOOK: query: SELECT percentile_approx(cast(substr(src.value,5) AS int), array(0.05,0.5,0.95,0.98), 100) FROM src PREHOOK: type: QUERY PREHOOK: Input: default@src -PREHOOK: Output: file:/var/folders/7i/7iCDbWRkGHOcgJgX0zscimPXXts/-Tmp-/mlahiri/hive_2010-06-23_12-44-24_513_3382890303984991141/10000 +PREHOOK: Output: file:/var/folders/7i/7iCDbWRkGHOcgJgX0zscimPXXts/-Tmp-/mlahiri/hive_2010-07-26_14-08-47_287_2752385889854385296/10000 POSTHOOK: query: SELECT percentile_approx(cast(substr(src.value,5) AS int), array(0.05,0.5,0.95,0.98), 100) FROM src POSTHOOK: type: QUERY POSTHOOK: Input: default@src -POSTHOOK: Output: file:/var/folders/7i/7iCDbWRkGHOcgJgX0zscimPXXts/-Tmp-/mlahiri/hive_2010-06-23_12-44-24_513_3382890303984991141/10000 +POSTHOOK: Output: file:/var/folders/7i/7iCDbWRkGHOcgJgX0zscimPXXts/-Tmp-/mlahiri/hive_2010-07-26_14-08-47_287_2752385889854385296/10000 [24.07,252.77777777777777,476.9444444444444,487.82] PREHOOK: query: SELECT percentile_approx(cast(substr(src.value,5) AS int), array(0.05,0.5,0.95,0.98), 1000) FROM src PREHOOK: type: QUERY PREHOOK: Input: default@src -PREHOOK: Output: file:/var/folders/7i/7iCDbWRkGHOcgJgX0zscimPXXts/-Tmp-/mlahiri/hive_2010-06-23_12-44-29_036_9025181836738993015/10000 +PREHOOK: Output: file:/var/folders/7i/7iCDbWRkGHOcgJgX0zscimPXXts/-Tmp-/mlahiri/hive_2010-07-26_14-08-51_922_7991525101468087783/10000 POSTHOOK: query: SELECT percentile_approx(cast(substr(src.value,5) AS int), array(0.05,0.5,0.95,0.98), 1000) FROM src POSTHOOK: type: QUERY POSTHOOK: Input: default@src -POSTHOOK: Output: file:/var/folders/7i/7iCDbWRkGHOcgJgX0zscimPXXts/-Tmp-/mlahiri/hive_2010-06-23_12-44-29_036_9025181836738993015/10000 +POSTHOOK: Output: file:/var/folders/7i/7iCDbWRkGHOcgJgX0zscimPXXts/-Tmp-/mlahiri/hive_2010-07-26_14-08-51_922_7991525101468087783/10000 [26.0,255.5,479.0,491.0] Index: ql/src/test/queries/clientpositive/udaf_percentile_approx.q =================================================================== --- ql/src/test/queries/clientpositive/udaf_percentile_approx.q (revision 979434) +++ ql/src/test/queries/clientpositive/udaf_percentile_approx.q (working copy) @@ -1,4 +1,6 @@ +set mapred.reduce.tasks=4 +set hive.exec.reducers.max=4 SELECT percentile_approx(cast(substr(src.value,5) AS double), 0.5) FROM src; SELECT percentile_approx(cast(substr(src.value,5) AS double), 0.5, 100) FROM src; SELECT percentile_approx(cast(substr(src.value,5) AS double), 0.5, 1000) FROM src; Index: ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFPercentileApprox.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFPercentileApprox.java (revision 979434) +++ ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFPercentileApprox.java (working copy) @@ -65,7 +65,7 @@ "this function.\n" + "Example (three percentiles requested using a finer histogram approximation):\n" + "> SELECT percentile_approx(val, array(0.5, 0.95, 0.98), 100000) FROM somedata;\n" + - "[0.05,1.64,2.26]\n" ) + "[0.05,1.64,2.26]\n") public class GenericUDAFPercentileApprox implements GenericUDAFResolver { static final Log LOG = LogFactory.getLog(GenericUDAFPercentileApprox.class.getName()); @@ -98,6 +98,7 @@ } // Validate the second parameter, which is either a solitary double or an array of doubles. + boolean wantManyQuantiles = false; switch(parameters[1].getCategory()) { case PRIMITIVE: // Only a single double was passed as parameter 2, a single quantile is being requested @@ -132,6 +133,7 @@ "A float/double array argument may be passed as parameter 2, but " + parameters[1].getTypeName() + " was passed instead."); } + wantManyQuantiles = true; break; default: @@ -159,22 +161,17 @@ } } - return new GenericUDAFPercentileApproxEvaluator(); + // Return an evaluator depending on the return type + if(wantManyQuantiles) { + return new GenericUDAFMultiplePercentileApproxEvaluator(); + } else { + return new GenericUDAFSinglePercentileApproxEvaluator(); + } } + + public static class GenericUDAFSinglePercentileApproxEvaluator extends + GenericUDAFPercentileApproxEvaluator { - /** - * Construct a histogram using the algorithm described by Ben-Haim and Tom-Tov, and then - * use it to compute an approximate percentile value. - */ - public static class GenericUDAFPercentileApproxEvaluator extends GenericUDAFEvaluator { - // For PARTIAL1 and COMPLETE: ObjectInspectors for original data - private PrimitiveObjectInspector inputOI; - private ObjectInspector quantilesOI; - private PrimitiveObjectInspector nbinsOI; - - // For PARTIAL2 and FINAL: ObjectInspectors for partial aggregations (list of doubles) - private StandardListObjectInspector loi; - @Override public ObjectInspector init(Mode m, ObjectInspector[] parameters) throws HiveException { super.init(m, parameters); @@ -196,20 +193,91 @@ // GenericUDAFHistogramNumeric, but we add on the percentile values requested to the // end, and handle serializing/deserializing before we pass things on to the parent // method. - // The return type for FINAL and COMPLETE is a full aggregation result, which is also a - // list of DoubleWritables with the requested quantile values. The only exception is - // when a single double, as opposed to an array of doubles, is passed as a parameter. In - // that case, just return a single double value. - if (m == Mode.PARTIAL1 || m == Mode.COMPLETE || - quantilesOI.getCategory() == ObjectInspector.Category.LIST) { + // The return type for FINAL and COMPLETE is a full aggregation result, which is a + // single double value + if (m == Mode.PARTIAL1 || m == Mode.PARTIAL2) { return ObjectInspectorFactory.getStandardListObjectInspector( - PrimitiveObjectInspectorFactory.writableDoubleObjectInspector); + PrimitiveObjectInspectorFactory.writableDoubleObjectInspector); } else { return PrimitiveObjectInspectorFactory.writableDoubleObjectInspector; } } @Override + public Object terminate(AggregationBuffer agg) throws HiveException { + PercentileAggBuf myagg = (PercentileAggBuf) agg; + + if (myagg.histogram.getUsedBins() < 1) { // SQL standard - return null for zero elements + return null; + } else { + assert(myagg.quantiles != null); + return new DoubleWritable(myagg.histogram.quantile(myagg.quantiles[0])); + } + } + } + + + public static class GenericUDAFMultiplePercentileApproxEvaluator extends + GenericUDAFPercentileApproxEvaluator { + + @Override + public ObjectInspector init(Mode m, ObjectInspector[] parameters) throws HiveException { + super.init(m, parameters); + + // init input object inspectors + if (m == Mode.PARTIAL1 || m == Mode.COMPLETE) { + inputOI = (PrimitiveObjectInspector) parameters[0]; + quantilesOI = parameters[1]; + if(parameters.length > 2) { + nbinsOI = (PrimitiveObjectInspector) parameters[2]; + } + } else { + loi = (StandardListObjectInspector) parameters[0]; + } + + // Init output object inspectors. + // + // The return type for a partial aggregation is still a list of doubles, as in + // GenericUDAFHistogramNumeric, but we add on the percentile values requested to the + // end, and handle serializing/deserializing before we pass things on to the parent + // method. + // The return type for FINAL and COMPLETE is a full aggregation result, which is also + // a list of doubles + return ObjectInspectorFactory.getStandardListObjectInspector( + PrimitiveObjectInspectorFactory.writableDoubleObjectInspector); + } + + @Override + public Object terminate(AggregationBuffer agg) throws HiveException { + PercentileAggBuf myagg = (PercentileAggBuf) agg; + + if (myagg.histogram.getUsedBins() < 1) { // SQL standard - return null for zero elements + return null; + } else { + ArrayList result = new ArrayList(); + assert(myagg.quantiles != null); + for(int i = 0; i < myagg.quantiles.length; i++) { + result.add(new DoubleWritable(myagg.histogram.quantile(myagg.quantiles[i]))); + } + return result; + } + } + } + + /** + * Construct a histogram using the algorithm described by Ben-Haim and Tom-Tov, and then + * use it to compute an approximate percentile value. + */ + public abstract static class GenericUDAFPercentileApproxEvaluator extends GenericUDAFEvaluator { + // For PARTIAL1 and COMPLETE: ObjectInspectors for original data + protected PrimitiveObjectInspector inputOI; + protected ObjectInspector quantilesOI; + protected PrimitiveObjectInspector nbinsOI; + + // For PARTIAL2 and FINAL: ObjectInspectors for partial aggregations (list of doubles) + protected StandardListObjectInspector loi; + + @Override public void merge(AggregationBuffer agg, Object partial) throws HiveException { if(partial == null) { return; @@ -305,26 +373,6 @@ myagg.histogram.add(v); } - @Override - public Object terminate(AggregationBuffer agg) throws HiveException { - PercentileAggBuf myagg = (PercentileAggBuf) agg; - - if (myagg.histogram.getUsedBins() < 1) { // SQL standard - return null for zero elements - return null; - } else { - ArrayList result = new ArrayList(); - assert(myagg.quantiles != null); - for(int i = 0; i < myagg.quantiles.length; i++) { - result.add(new DoubleWritable(myagg.histogram.quantile(myagg.quantiles[i]))); - } - if(myagg.quantiles.length == 1) { - return result.get(0); - } else { - return result; - } - } - } - // Aggregation buffer methods. We wrap GenericUDAFHistogramNumeric's aggregation buffer // inside our own, so that we can also store requested quantile values between calls static class PercentileAggBuf implements AggregationBuffer {