Index: ql/src/test/results/clientpositive/udf_length.q.out =================================================================== --- ql/src/test/results/clientpositive/udf_length.q.out (revision 793661) +++ ql/src/test/results/clientpositive/udf_length.q.out (working copy) @@ -1,8 +1,7 @@ query: CREATE TABLE dest1(len INT) -query: EXPLAIN -FROM src INSERT OVERWRITE TABLE dest1 SELECT length(src.value) +query: EXPLAIN FROM src1 INSERT OVERWRITE TABLE dest1 SELECT length(src1.value) ABSTRACT SYNTAX TREE: - (TOK_QUERY (TOK_FROM (TOK_TABREF src)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB dest1)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION length (. (TOK_TABLE_OR_COL src) value)))))) + (TOK_QUERY (TOK_FROM (TOK_TABREF src1)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB dest1)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION length (. (TOK_TABLE_OR_COL src1) value)))))) STAGE DEPENDENCIES: Stage-1 is a root stage @@ -13,7 +12,7 @@ Stage: Stage-1 Map Reduce Alias -> Map Operator Tree: - src + src1 Select Operator expressions: expr: length(value) @@ -33,10 +32,10 @@ Move Operator files: hdfs directory: true - destination: file:/data/users/emil/hive1/hive1/build/ql/tmp/809193894/10000 + destination: file:/data/users/emil/hive1/hive1/build/ql/tmp/891633093/10000 Map Reduce Alias -> Map Operator Tree: - file:/data/users/emil/hive1/hive1/build/ql/tmp/353629462/10002 + file:/data/users/emil/hive1/hive1/build/ql/tmp/988560065/10002 Reduce Output Operator sort order: Map-reduce partition columns: @@ -68,14 +67,14 @@ name: dest1 -query: FROM src INSERT OVERWRITE TABLE dest1 SELECT length(src.value) -Input: default/src +query: FROM src1 INSERT OVERWRITE TABLE dest1 SELECT length(src1.value) +Input: default/src1 Output: default/dest1 query: SELECT dest1.* FROM dest1 Input: default/dest1 -Output: file:/data/users/emil/hive1/hive1/build/ql/tmp/1493924198/10000 +Output: file:/data/users/emil/hive1/hive1/build/ql/tmp/469108904/10000 7 -6 +0 7 6 7 @@ -89,489 +88,51 @@ 7 7 7 -7 -7 +0 +0 6 +0 7 7 7 -7 -7 -7 -7 -7 -7 
-7 -6 -7 -7 -7 -7 -6 -6 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -6 -7 -7 -7 -7 -5 -7 -7 -7 -7 -6 -7 -7 -7 -7 -7 -7 -7 -6 -7 -7 -7 -7 -6 -7 -6 -6 -5 -7 -6 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -6 -7 -6 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -6 -7 -7 -7 -7 -7 -7 -6 -7 -7 -7 -6 -7 -7 -6 -7 -7 -7 -7 -7 -7 -7 -7 -7 -6 -7 -7 -6 -5 -7 -7 -7 -7 -6 -7 -7 -7 -7 -6 -7 -7 -7 -7 -5 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -6 -6 -7 -7 -7 -7 -7 -7 -6 -7 -6 -7 -7 -7 -7 -7 -6 -7 -7 -7 -7 -6 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -6 -6 -5 -7 -6 -7 -7 -7 -7 -7 -7 -7 -7 -7 -6 -7 -6 -6 -7 -6 -7 -6 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -5 -7 -7 -7 -7 -7 -6 -6 -7 -6 -6 -7 -6 -7 -7 -7 -7 -7 -7 -7 -7 -6 -7 -7 -7 -7 -7 -6 -6 -6 -7 -7 -7 -7 -7 -7 -7 -7 -6 -7 -6 -7 -6 -6 -7 -6 -7 -6 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -6 -7 -7 -7 -7 -7 -5 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -5 -6 -7 -7 -7 -6 -6 -7 -7 -7 -7 -7 -7 -7 -7 -7 -6 -7 -7 -7 -7 -7 -7 -7 -7 -7 -6 -7 -7 -7 -7 -6 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -5 -6 -7 -7 -7 -6 -7 -7 -7 -6 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -6 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -6 -7 -6 -7 -7 -6 -7 -7 -7 -7 -5 -7 -7 -7 -7 -7 -7 -7 -7 -7 -6 -7 -7 -7 -7 -6 -7 -7 -6 -6 -6 -7 -7 -7 -7 -7 -7 -7 -7 -7 -6 -7 -7 -7 -7 -6 +0 +0 +0 query: DROP TABLE dest1 +query: -- Test with non-ascii characters. +CREATE TABLE dest1(name STRING) STORED AS TEXTFILE +query: LOAD DATA LOCAL INPATH '../data/files/kv4.txt' INTO TABLE dest1 +query: EXPLAIN SELECT length(dest1.name) FROM dest1 +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF dest1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION length (. 
(TOK_TABLE_OR_COL dest1) name)))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + dest1 + Select Operator + expressions: + expr: length(name) + type: int + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +query: SELECT length(dest1.name) FROM dest1 +Input: default/dest1 +Output: file:/data/users/emil/hive1/hive1/build/ql/tmp/1552330536/10000 +2 +query: DROP TABLE dest1 Index: ql/src/test/results/clientpositive/udf_reverse.q.out =================================================================== --- ql/src/test/results/clientpositive/udf_reverse.q.out (revision 793661) +++ ql/src/test/results/clientpositive/udf_reverse.q.out (working copy) @@ -1,6 +1,5 @@ query: CREATE TABLE dest1(len STRING) -query: EXPLAIN -FROM src1 INSERT OVERWRITE TABLE dest1 SELECT reverse(src1.value) +query: EXPLAIN FROM src1 INSERT OVERWRITE TABLE dest1 SELECT reverse(src1.value) ABSTRACT SYNTAX TREE: (TOK_QUERY (TOK_FROM (TOK_TABREF src1)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB dest1)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION reverse (. 
(TOK_TABLE_OR_COL src1) value)))))) @@ -33,10 +32,10 @@ Move Operator files: hdfs directory: true - destination: file:/data/users/emil/hive1/hive1/build/ql/tmp/533825523/10000 + destination: file:/data/users/emil/hive1/hive1/build/ql/tmp/28581454/10000 Map Reduce Alias -> Map Operator Tree: - file:/data/users/emil/hive1/hive1/build/ql/tmp/620216694/10002 + file:/data/users/emil/hive1/hive1/build/ql/tmp/883276693/10002 Reduce Output Operator sort order: Map-reduce partition columns: @@ -73,7 +72,7 @@ Output: default/dest1 query: SELECT dest1.* FROM dest1 Input: default/dest1 -Output: file:/data/users/emil/hive1/hive1/build/ql/tmp/1181513979/10000 +Output: file:/data/users/emil/hive1/hive1/build/ql/tmp/68123585/10000 832_lav 113_lav @@ -100,3 +99,40 @@ query: DROP TABLE dest1 +query: -- Test with non-ascii characters +CREATE TABLE dest1(name STRING) STORED AS TEXTFILE +query: LOAD DATA LOCAL INPATH '../data/files/kv4.txt' INTO TABLE dest1 +query: EXPLAIN SELECT reverse(dest1.name) FROM dest1 +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF dest1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION reverse (. 
(TOK_TABLE_OR_COL dest1) name)))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + dest1 + Select Operator + expressions: + expr: reverse(name) + type: string + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +query: SELECT reverse(dest1.name) FROM dest1 +Input: default/dest1 +Output: file:/data/users/emil/hive1/hive1/build/ql/tmp/1873922215/10000 +铮邵 +query: DROP TABLE dest1 Index: ql/src/test/queries/clientpositive/udf_length.q =================================================================== --- ql/src/test/queries/clientpositive/udf_length.q (revision 793661) +++ ql/src/test/queries/clientpositive/udf_length.q (working copy) @@ -1,10 +1,12 @@ CREATE TABLE dest1(len INT); - -EXPLAIN -FROM src INSERT OVERWRITE TABLE dest1 SELECT length(src.value); - -FROM src INSERT OVERWRITE TABLE dest1 SELECT length(src.value); - +EXPLAIN FROM src1 INSERT OVERWRITE TABLE dest1 SELECT length(src1.value); +FROM src1 INSERT OVERWRITE TABLE dest1 SELECT length(src1.value); SELECT dest1.* FROM dest1; +DROP TABLE dest1; +-- Test with non-ascii characters. 
+CREATE TABLE dest1(name STRING) STORED AS TEXTFILE; +LOAD DATA LOCAL INPATH '../data/files/kv4.txt' INTO TABLE dest1; +EXPLAIN SELECT length(dest1.name) FROM dest1; +SELECT length(dest1.name) FROM dest1; DROP TABLE dest1; Index: ql/src/test/queries/clientpositive/udf_reverse.q =================================================================== --- ql/src/test/queries/clientpositive/udf_reverse.q (revision 793661) +++ ql/src/test/queries/clientpositive/udf_reverse.q (working copy) @@ -1,10 +1,12 @@ CREATE TABLE dest1(len STRING); - -EXPLAIN +EXPLAIN FROM src1 INSERT OVERWRITE TABLE dest1 SELECT reverse(src1.value); FROM src1 INSERT OVERWRITE TABLE dest1 SELECT reverse(src1.value); - -FROM src1 INSERT OVERWRITE TABLE dest1 SELECT reverse(src1.value); - SELECT dest1.* FROM dest1; +DROP TABLE dest1; +-- Test with non-ascii characters +CREATE TABLE dest1(name STRING) STORED AS TEXTFILE; +LOAD DATA LOCAL INPATH '../data/files/kv4.txt' INTO TABLE dest1; +EXPLAIN SELECT reverse(dest1.name) FROM dest1; +SELECT reverse(dest1.name) FROM dest1; DROP TABLE dest1; Index: ql/src/java/org/apache/hadoop/hive/ql/udf/UDFReverse.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/udf/UDFReverse.java (revision 793661) +++ ql/src/java/org/apache/hadoop/hive/ql/udf/UDFReverse.java (working copy) @@ -24,23 +24,42 @@ public class UDFReverse extends UDF { private Text result = new Text(); - public Text evaluate(Text s) { + /** + * Reverse a portion of an array in-place. + * + * @param arr The array where the data will be reversed. + * @param first The beginning of the portion (inclusive). + * @param last The end of the portion (inclusive). 
+ */ + private void reverse(byte[] arr, int first, int last) { + for(int i = 0; i < (last-first+1)/2; i++) { + byte temp = arr[last - i]; + arr[last - i] = arr[first + i]; + arr[first + i] = temp; + } + } + + public Text evaluate(Text s) { if (s == null) { return null; } - - // Use a string because Text.getLength() returns the number of bytes. - // This can be optimized by walking over the utf8 characters and not - // creating a string at all. - String text = s.toString(); - // Append the text to a StringBuffer in reverse order. - StringBuffer revBuff = new StringBuffer(); - for (int i = text.length() - 1; i >= 0; i--) { - revBuff.append(text.charAt(i)); + // set() will only allocate memory if the buffer of result is smaller than + // s.getLength() and will never resize the buffer down. + result.set(s); + + // Now do an in-place reversal in result.getBytes(). First, reverse the + // bytes of each UTF-8 character, then reverse the whole byte array. + int prev = 0; // The byte index where the current char starts + for(int i = 1; i < result.getLength(); i++) { + if(result.charAt(i) != -1) { + reverse(result.getBytes(), prev, i-1); + prev = i; + } } + reverse(result.getBytes(), prev, result.getLength() - 1); + reverse(result.getBytes(), 0, result.getLength() - 1); - result.set(revBuff.toString()); return result; } } Index: ql/src/java/org/apache/hadoop/hive/ql/udf/UDFLength.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/udf/UDFLength.java (revision 793661) +++ ql/src/java/org/apache/hadoop/hive/ql/udf/UDFLength.java (working copy) @@ -29,7 +29,14 @@ return null; } - result.set(s.toString().length()); + int len = 0; + for(int i = 0; i < s.getLength(); i++) { + if(s.charAt(i) != -1) { // Don't count UTF-8 trailing bytes + len++; + } + } + + result.set(len); return result; } }