Index: contrib/src/test/results/clientpositive/udaf_example_min_n.q.out =================================================================== --- contrib/src/test/results/clientpositive/udaf_example_min_n.q.out (revision 0) +++ contrib/src/test/results/clientpositive/udaf_example_min_n.q.out (revision 0) @@ -0,0 +1,92 @@ +PREHOOK: query: CREATE TEMPORARY FUNCTION example_min_n AS 'org.apache.hadoop.hive.contrib.udaf.example.UDAFMinN' +PREHOOK: type: CREATEFUNCTION +POSTHOOK: query: CREATE TEMPORARY FUNCTION example_min_n AS 'org.apache.hadoop.hive.contrib.udaf.example.UDAFMinN' +POSTHOOK: type: CREATEFUNCTION +PREHOOK: query: EXPLAIN +SELECT example_min_n(substr(value,5),10), + example_min_n(IF(substr(value,5) > 250, NULL, substr(value,5)),10) +FROM src +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +SELECT example_min_n(substr(value,5),10), + example_min_n(IF(substr(value,5) > 250, NULL, substr(value,5)),10) +FROM src +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF src)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION example_min_n (TOK_FUNCTION substr (TOK_TABLE_OR_COL value) 5) 10)) (TOK_SELEXPR (TOK_FUNCTION example_min_n (TOK_FUNCTION IF (> (TOK_FUNCTION substr (TOK_TABLE_OR_COL value) 5) 250) TOK_NULL (TOK_FUNCTION substr (TOK_TABLE_OR_COL value) 5)) 10))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + src + TableScan + alias: src + Select Operator + expressions: + expr: value + type: string + outputColumnNames: value + Group By Operator + aggregations: + expr: example_min_n(substr(value, 5), 10) + expr: example_min_n(if((substr(value, 5) > 250), null, substr(value, 5)), 10) + bucketGroup: false + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: struct,n:int> + expr: _col1 + type: struct,n:int> + Reduce Operator Tree: + Group By Operator + aggregations: + expr: example_min_n(VALUE._col0) + expr: example_min_n(VALUE._col1) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: array + expr: _col1 + type: array + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT example_min_n(substr(value,5),10), + example_min_n(IF(substr(value,5) > 250, NULL, substr(value,5)),10) +FROM src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: file:/home/bjia/trunk/VENDOR.hive/trunk/build/ql/tmp/1955309834/10000 +POSTHOOK: query: SELECT example_min_n(substr(value,5),10), + example_min_n(IF(substr(value,5) > 250, NULL, substr(value,5)),10) +FROM src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: file:/home/bjia/trunk/VENDOR.hive/trunk/build/ql/tmp/1955309834/10000 +[0.0,0.0,0.0,2.0,4.0,5.0,5.0,5.0,8.0,9.0] [0.0,0.0,0.0,2.0,4.0,5.0,5.0,5.0,8.0,9.0] +PREHOOK: query: DROP TEMPORARY FUNCTION example_min_n +PREHOOK: type: DROPFUNCTION +POSTHOOK: query: DROP TEMPORARY FUNCTION example_min_n +POSTHOOK: type: DROPFUNCTION Index: contrib/src/test/results/clientpositive/udaf_example_max_n.q.out =================================================================== --- contrib/src/test/results/clientpositive/udaf_example_max_n.q.out (revision 0) +++ contrib/src/test/results/clientpositive/udaf_example_max_n.q.out (revision 0) @@ -0,0 +1,92 @@ +PREHOOK: query: CREATE TEMPORARY FUNCTION example_max_n AS 'org.apache.hadoop.hive.contrib.udaf.example.UDAFMaxN' +PREHOOK: type: CREATEFUNCTION +POSTHOOK: query: CREATE TEMPORARY FUNCTION example_max_n AS 'org.apache.hadoop.hive.contrib.udaf.example.UDAFMaxN' +POSTHOOK: type: CREATEFUNCTION +PREHOOK: query: EXPLAIN +SELECT example_max_n(substr(value,5),10), + example_max_n(IF(substr(value,5) > 250, NULL, substr(value,5)),10) +FROM src +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +SELECT example_max_n(substr(value,5),10), + example_max_n(IF(substr(value,5) > 250, NULL, substr(value,5)),10) +FROM src +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF src)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION example_max_n (TOK_FUNCTION substr (TOK_TABLE_OR_COL value) 5) 10)) (TOK_SELEXPR (TOK_FUNCTION example_max_n (TOK_FUNCTION IF (> (TOK_FUNCTION substr (TOK_TABLE_OR_COL value) 5) 250) TOK_NULL (TOK_FUNCTION substr (TOK_TABLE_OR_COL value) 5)) 10))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + src + TableScan + alias: src + Select Operator + expressions: + expr: value + type: string + outputColumnNames: value + Group By Operator + aggregations: + expr: example_max_n(substr(value, 5), 10) + expr: example_max_n(if((substr(value, 5) > 250), null, substr(value, 5)), 10) + bucketGroup: false + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: struct,n:int> + expr: _col1 + type: struct,n:int> + Reduce Operator Tree: + Group By Operator + aggregations: + expr: example_max_n(VALUE._col0) + expr: example_max_n(VALUE._col1) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: array + expr: _col1 + type: array + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT example_max_n(substr(value,5),10), + example_max_n(IF(substr(value,5) > 250, NULL, substr(value,5)),10) +FROM src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: file:/home/bjia/trunk/VENDOR.hive/trunk/build/ql/tmp/1599831192/10000 +POSTHOOK: query: SELECT example_max_n(substr(value,5),10), + example_max_n(IF(substr(value,5) > 250, NULL, substr(value,5)),10) +FROM src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: file:/home/bjia/trunk/VENDOR.hive/trunk/build/ql/tmp/1599831192/10000 +[498.0,498.0,498.0,497.0,496.0,495.0,494.0,493.0,492.0,492.0] [249.0,248.0,247.0,244.0,242.0,242.0,241.0,239.0,239.0,238.0] +PREHOOK: query: DROP TEMPORARY FUNCTION example_max_n +PREHOOK: type: DROPFUNCTION +POSTHOOK: query: DROP TEMPORARY FUNCTION example_max_n +POSTHOOK: type: DROPFUNCTION Index: contrib/src/test/queries/clientpositive/udaf_example_min_n.q =================================================================== --- contrib/src/test/queries/clientpositive/udaf_example_min_n.q (revision 0) +++ contrib/src/test/queries/clientpositive/udaf_example_min_n.q (revision 0) @@ -0,0 +1,14 @@ +add jar ../build/contrib/hive_contrib.jar; + +CREATE TEMPORARY FUNCTION example_min_n AS 'org.apache.hadoop.hive.contrib.udaf.example.UDAFMinN'; + +EXPLAIN +SELECT example_min_n(substr(value,5),10), + example_min_n(IF(substr(value,5) > 250, NULL, substr(value,5)),10) +FROM src; + +SELECT example_min_n(substr(value,5),10), + example_min_n(IF(substr(value,5) > 250, NULL, substr(value,5)),10) +FROM src; + +DROP TEMPORARY FUNCTION example_min_n; Index: contrib/src/test/queries/clientpositive/udaf_example_max_n.q =================================================================== --- contrib/src/test/queries/clientpositive/udaf_example_max_n.q (revision 0) +++ contrib/src/test/queries/clientpositive/udaf_example_max_n.q (revision 0) @@ -0,0 +1,14 @@ +add jar ../build/contrib/hive_contrib.jar; + +CREATE TEMPORARY FUNCTION example_max_n AS 'org.apache.hadoop.hive.contrib.udaf.example.UDAFMaxN'; + +EXPLAIN +SELECT example_max_n(substr(value,5),10), + example_max_n(IF(substr(value,5) > 250, NULL, substr(value,5)),10) +FROM src; + +SELECT example_max_n(substr(value,5),10), + example_max_n(IF(substr(value,5) > 250, NULL, substr(value,5)),10) +FROM src; + +DROP TEMPORARY FUNCTION example_max_n; Index: contrib/src/java/org/apache/hadoop/hive/contrib/udaf/example/UDAFMinN.java =================================================================== --- contrib/src/java/org/apache/hadoop/hive/contrib/udaf/example/UDAFMinN.java (revision 0) +++ contrib/src/java/org/apache/hadoop/hive/contrib/udaf/example/UDAFMinN.java (revision 0) @@ -0,0 +1,145 @@ +package org.apache.hadoop.hive.contrib.udaf.example; + +import org.apache.hadoop.hive.ql.exec.UDAF; +import org.apache.hadoop.hive.ql.exec.UDAFEvaluator; +import java.util.*; + +public class UDAFMinN extends UDAF{ + + /** + * Hive will automatically look for all internal classes of the UDAF + * that implements UDAFEvaluator. + */ + public static class UDAFMinNState { + private ArrayList MinNArray; //This ArrrayList holds the max N + private int N; // This is the N + } + + public static class UDAFMinNEvaluator implements UDAFEvaluator { + + UDAFMinNState state; //Define a state + + public UDAFMinNEvaluator() { + super(); + state = new UDAFMinNState (); + init(); + } + + /** + * Reset the state. + */ + public void init() { + state.MinNArray = new ArrayList (); + state.N = 0; + } + + /** + * Iterate through one row of original data. + * + * The number and type of arguments need to the same as we call + * this UDAF from Hive command line. + * + * This function should always return true. + * This function fills MinNArray with one element and then adds following elements by binary insertion. + * When MinNArray has a size of N and the following element is greater than the smallest element in MinNArray, insert it and remove the smallest element in MinNArray. + */ + public boolean iterate(Double o, int MinN) { + state.N = MinN; + if (o != null) { + if (state.MinNArray.size() >=MinN && o>state.MinNArray.get(state.MinNArray.size()-1)) + { + //doing nothing. o is greater than the greatest element in MinNArray. + } + else // if MinNArray is empty, add the first element in. Otherwise, do a binary insertion. + { + int first = 0; + int last = state.MinNArray.size()-1; + int mid = 0; + int iState = 0; + while(first <= last) + { + mid = (first + last)/2; + if(o < state.MinNArray.get(mid)) + { + last = mid -1; + iState = 0; + } + else + { + first = mid+1; + iState = 1; + } + } + state.MinNArray.add(mid+iState, o); + + if(state.MinNArray.size()>MinN) + { + state.MinNArray.remove(state.MinNArray.size()-1); + } + + } + } + return true; + } + + public UDAFMinNState terminatePartial() { + // This is SQL standard - min_n of zero items should be null. + return state.MinNArray.size() == 0 ? null : state; + } + + /* Two pointers are created to track the minimal elements in both o and MinNArray + * The smallest element is added into tempArrayList + * Consider the sizes of o and MinNArray may be different. + * + */ + + + public boolean merge(UDAFMinNState o) { + + if (o != null) { + int n1 = o.MinNArray.size(); //get the size of o + int n2 = state.MinNArray.size(); //get the size of MinNArray ; + int pnt_o = 0; //pointer for o + int pnt_MinNArray = 0; //pointer for MinNArray + ArrayList tempArrayList = new ArrayList (); //Create a temp ArrayList to hold the N min elements. + for (int i = 0; i o.MinNArray.get(pnt_o)) + { + tempArrayList.add(i,o.MinNArray.get(pnt_o)) ; //Add the largest element in o into tempArrayList; + pnt_o++; + } + else if(pnt_MinNArray == n2 && pnt_o < n1) //This will only happen if MinNArray and o have different sizes; + { + tempArrayList.add(i,o.MinNArray.get(pnt_o)); + pnt_o++; + } + else if(pnt_o==n1 && pnt_MinNArray terminate() { + // This is SQL standard - return state.MinNArray, or null if the size is zero. + + return state.MinNArray.size() == 0 ? null : state.MinNArray; + } + } + + +} + Index: contrib/src/java/org/apache/hadoop/hive/contrib/udaf/example/UDAFMaxN.java =================================================================== --- contrib/src/java/org/apache/hadoop/hive/contrib/udaf/example/UDAFMaxN.java (revision 0) +++ contrib/src/java/org/apache/hadoop/hive/contrib/udaf/example/UDAFMaxN.java (revision 0) @@ -0,0 +1,146 @@ +package org.apache.hadoop.hive.contrib.udaf.example; + +import org.apache.hadoop.hive.ql.exec.UDAF; +import org.apache.hadoop.hive.ql.exec.UDAFEvaluator; +import java.util.*; + +public class UDAFMaxN extends UDAF{ + + /** + * Hive will automatically look for all internal classes of the UDAF + * that implements UDAFEvaluator. + */ + public static class UDAFMaxNState { + private ArrayList maxNArray; //This ArrrayList holds the max N + private int N; // This is the N + } + + public static class UDAFMaxNEvaluator implements UDAFEvaluator { + + UDAFMaxNState state; //Define a state + + public UDAFMaxNEvaluator() { + super(); + state = new UDAFMaxNState (); + init(); + } + + /** + * Reset the state. + */ + public void init() { + state.maxNArray = new ArrayList (); + state.N = 0; + } + + /** + * Iterate through one row of original data. + * + * The number and type of arguments need to the same as we call + * this UDAF from Hive command line. + * + * This function should always return true. + * This function fills maxNArray with one element and then adds following elements by binary insertion. + * When maxNArray has a size of N and the following element is greater than the smallest element in maxNArray, insert it and remove the smallest element in maxNArray. + */ + public boolean iterate(Double o, int maxN) { + state.N = maxN; + if (o != null) { + if (state.maxNArray.size() >=maxN && o state.maxNArray.get(mid)) + { + last = mid -1; + iState = 0; + } + else + { + first = mid+1; + iState = 1; + } + } + state.maxNArray.add(mid+iState, o); + + if(state.maxNArray.size()>maxN) + { + state.maxNArray.remove(state.maxNArray.size()-1); + } + + } + } + return true; + } + + public UDAFMaxNState terminatePartial() { + // This is SQL standard - max_n of zero items should be null. + return state.maxNArray.size() == 0 ? null : state; + } + + /* Two pointers are created to track the maximal elements in both o and maxNArray + * The largest element is added into tempArrayList + * Consider the sizes of o and maxNArray may be different. + * + */ + + + public boolean merge(UDAFMaxNState o) { + + if (o != null) { + int n1 = o.maxNArray.size(); //get the size of o + int n2 = state.maxNArray.size(); //get the size of maxNArray ; + int pnt_o = 0; //pointer for o + int pnt_maxNArray = 0; //pointer for maxNArray + ArrayList tempArrayList = new ArrayList (); //Create a temp ArrayList to hold the N max elements. + for (int i = 0; i= o.maxNArray.get(pnt_o)) //if the largest element in maxNArray is greater than or equal to the largest element in o; + { + tempArrayList.add(i,state.maxNArray.get(pnt_maxNArray)); //Add the largest element in maxNArray into tempArrayList. + pnt_maxNArray++; //Shift pointer for maxNArray. + } + else if(pnt_o <= n1-1 && pnt_maxNArray <= n2-1 && state.maxNArray.get(pnt_maxNArray)< o.maxNArray.get(pnt_o)) + { + tempArrayList.add(i,o.maxNArray.get(pnt_o)) ; //Add the largest element in o into tempArrayList; + pnt_o++; + } + else if(pnt_maxNArray == n2 && pnt_o < n1) //This will only happen if maxNArray and o have different sizes; + { + tempArrayList.add(i,o.maxNArray.get(pnt_o)); + pnt_o++; + } + else if(pnt_o==n1 && pnt_maxNArray terminate() { + // This is SQL standard - return state.maxNArray, or null if the size is zero. + + return state.maxNArray.size() == 0 ? null : state.maxNArray; + } + } + + +} + Property changes on: contrib/src/java/org/apache/hadoop/hive/contrib/udaf/example/UDAFMaxN.java ___________________________________________________________________ Added: svn:executable + *