Index: ql/src/java/org/apache/hadoop/hive/ql/udf/generic/NumericHistogram.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/udf/generic/NumericHistogram.java (revision 28300) +++ ql/src/java/org/apache/hadoop/hive/ql/udf/generic/NumericHistogram.java (working copy) @@ -20,6 +20,7 @@ import java.util.ArrayList; import java.util.List; import java.util.Arrays; +import java.util.Collections; import java.util.Random; import org.apache.hadoop.hive.serde2.io.DoubleWritable; @@ -55,7 +56,7 @@ // Class variables private int nbins; private int nusedbins; - private Coord[] bins; + private ArrayList bins; private Random prng; /** @@ -101,7 +102,7 @@ * Returns a particular histogram bin. */ public Coord getBin(int b) { - return bins[b]; + return bins.get(b); } /** @@ -111,10 +112,7 @@ */ public void allocate(int num_bins) { nbins = num_bins; - bins = new Coord[nbins+1]; - for(int i = 0; i < nbins+1; i++) { - bins[i] = new Coord(); - } + bins = new ArrayList(); nusedbins = 0; } @@ -135,31 +133,32 @@ // by deserializing the ArrayList of (x,y) pairs into an array of Coord objects nbins = (int) other.get(0).get(); nusedbins = (other.size()-1)/2; - bins = new Coord[nbins+1]; // +1 to hold a temporary bin for insert() - for(int i = 1; i < other.size(); i+=2) { - bins[(i-1)/2] = new Coord(); - bins[(i-1)/2].x = other.get(i).get(); - bins[(i-1)/2].y = other.get(i+1).get(); + bins = new ArrayList(nusedbins); + for (int i = 1; i < other.size(); i+=2) { + Coord bin = new Coord(); + bin.x = other.get(i).get(); + bin.y = other.get(i+1).get(); + bins.add(bin); } } else { // The aggregation buffer already contains a partial histogram. Therefore, we need // to merge histograms using Algorithm #2 from the Ben-Haim and Tom-Tov paper. - Coord[] tmp_bins = new Coord[nusedbins + (other.size()-1)/2]; - for(int j = 0; j < tmp_bins.length; j++) { - tmp_bins[j] = new Coord(); - } + ArrayList tmp_bins = new ArrayList(nusedbins + (other.size()-1)/2); // Copy all the histogram bins from us and 'other' into an overstuffed histogram - int i; - for(i = 0; i < nusedbins; i++) { - tmp_bins[i].x = bins[i].x; - tmp_bins[i].y = bins[i].y; + for (int i = 0; i < nusedbins; i++) { + Coord bin = new Coord(); + bin.x = bins.get(i).x; + bin.y = bins.get(i).y; + tmp_bins.add(bin); } - for(int j = 1; j < other.size(); j+=2, i++) { - tmp_bins[i].x = other.get(j).get(); - tmp_bins[i].y = other.get(j+1).get(); + for (int j = 1; j < other.size(); j += 2) { + Coord bin = new Coord(); + bin.x = other.get(j).get(); + bin.y = other.get(j+1).get(); + tmp_bins.add(bin); } - Arrays.sort(tmp_bins); + Collections.sort(tmp_bins); // Now trim the overstuffed histogram down to the correct number of bins bins = tmp_bins; @@ -185,10 +184,10 @@ int bin = 0; for(int l=0, r=nusedbins; l < r; ) { bin = (l+r)/2; - if(bins[bin].x > v) { + if (bins.get(bin).x > v) { r = bin; } else { - if(bins[bin].x < v) { + if (bins.get(bin).x < v) { l = ++bin; } else { break; // break loop on equal comparator @@ -202,21 +201,26 @@ // assumed to be equal to the closest bin -- if fabs(v-bins[bin].x) < THRESHOLD, then // just increment 'bin'. This is not done now because we don't want to make any // assumptions about the range of numeric data being analyzed. - if(bin < nusedbins && bins[bin].x == v) { - bins[bin].y++; + if (bin < nusedbins && bins.get(bin).x == v) { + bins.get(bin).y++; } else { - for(int i = nusedbins; i > bin; i--) { - bins[i].x = bins[i-1].x; - bins[i].y = bins[i-1].y; + Coord newBin = new Coord(); + newBin.x = v; + newBin.y = 1; + if (bins.size() == nusedbins) { + bins.add(newBin); } - bins[bin].x = v; // new bins bin for value 'v' - bins[bin].y = 1; // of height 1 unit + for (int i = nusedbins; i > bin; i--) { + bins.set(i, bins.get(i-1)); + } + bins.set(bin, newBin); // Trim the bins down to the correct number of bins. - if(++nusedbins > nbins) { + if (++nusedbins > nbins) { trim(); } } + } /** @@ -227,10 +231,10 @@ private void trim() { while(nusedbins > nbins) { // Find the closest pair of bins in terms of x coordinates. Break ties randomly. - double smallestdiff = bins[1].x - bins[0].x; + double smallestdiff = bins.get(1).x - bins.get(0).x; int smallestdiffloc = 0, smallestdiffcount = 1; for(int i = 1; i < nusedbins-1; i++) { - double diff = bins[i+1].x - bins[i].x; + double diff = bins.get(i+1).x - bins.get(i).x; if(diff < smallestdiff) { smallestdiff = diff; smallestdiffloc = i; @@ -244,16 +248,20 @@ // Merge the two closest bins into their average x location, weighted by their heights. // The height of the new bin is the sum of the heights of the old bins. - double d = bins[smallestdiffloc].y + bins[smallestdiffloc+1].y; - bins[smallestdiffloc].x *= bins[smallestdiffloc].y / d; - bins[smallestdiffloc].x += bins[smallestdiffloc+1].x / d * - bins[smallestdiffloc+1].y; - bins[smallestdiffloc].y = d; + // double d = bins[smallestdiffloc].y + bins[smallestdiffloc+1].y; + // bins[smallestdiffloc].x *= bins[smallestdiffloc].y / d; + // bins[smallestdiffloc].x += bins[smallestdiffloc+1].x / d * + // bins[smallestdiffloc+1].y; + // bins[smallestdiffloc].y = d; + double d = bins.get(smallestdiffloc).y + bins.get(smallestdiffloc+1).y; + Coord smallestdiffbin = bins.get(smallestdiffloc); + smallestdiffbin.x *= smallestdiffbin.y / d; + smallestdiffbin.x += bins.get(smallestdiffloc+1).x / d * bins.get(smallestdiffloc+1).y; + smallestdiffbin.y = d; // Shift the remaining bins left one position for(int i = smallestdiffloc+1; i < nusedbins-1; i++) { - bins[i].x = bins[i+1].x; - bins[i].y = bins[i+1].y; + bins.set(i, bins.get(i+1)); } nusedbins--; } @@ -271,16 +279,17 @@ double sum = 0, csum = 0; int b; for(b = 0; b < nusedbins; b++) { - sum += bins[b].y; + sum += bins.get(b).y; } for(b = 0; b < nusedbins; b++) { - csum += bins[b].y; + csum += bins.get(b).y; if(csum / sum >= q) { if(b == 0) { - return bins[b].x; + return bins.get(b).x; } - csum -= bins[b].y; - double r = bins[b-1].x + (q*sum - csum) * (bins[b].x-bins[b-1].x)/(bins[b].y); + + csum -= bins.get(b).y; + double r = bins.get(b-1).x + (q*sum - csum) * (bins.get(b).x - bins.get(b-1).x)/(bins.get(b).y); return r; } } @@ -304,8 +313,8 @@ result.add(new DoubleWritable(nbins)); if(bins != null) { for(int i = 0; i < nusedbins; i++) { - result.add(new DoubleWritable(bins[i].x)); - result.add(new DoubleWritable(bins[i].y)); + result.add(new DoubleWritable(bins.get(i).x)); + result.add(new DoubleWritable(bins.get(i).y)); } }