diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/SerDeUtils.java b/serde/src/java/org/apache/hadoop/hive/serde2/SerDeUtils.java index 40ede1a..c65174e 100644 --- a/serde/src/java/org/apache/hadoop/hive/serde2/SerDeUtils.java +++ b/serde/src/java/org/apache/hadoop/hive/serde2/SerDeUtils.java @@ -555,6 +555,6 @@ public static Text transformTextToUTF8(Text text, Charset previousCharset) { } public static Text transformTextFromUTF8(Text text, Charset targetCharset) { - return new Text(new String(text.getBytes()).getBytes(targetCharset)); + return new Text(new String(text.getBytes(), 0, text.getLength()).getBytes(targetCharset)); } } diff --git a/data/files/encoding-utf8.txt b/data/files/encoding-utf8.txt new file mode 100644 index 0000000..88bd256 --- /dev/null +++ b/data/files/encoding-utf8.txt @@ -0,0 +1,12 @@ +Tao,Li +Wisgood +Benguo,Me +Xianqiang,Shen +Wensheng,Wang +Haijun,Qiao +Shilong,Zhang +Xiaoqing,You +Aiqing,Song +Zhenhua,Han +Weiqi,Peng +Hua,Li diff --git a/ql/src/test/queries/clientpositive/insert_non_utf8_encoding_table.q b/ql/src/test/queries/clientpositive/insert_non_utf8_encoding_table.q new file mode 100644 index 0000000..0f9db02 --- /dev/null +++ b/ql/src/test/queries/clientpositive/insert_non_utf8_encoding_table.q @@ -0,0 +1,20 @@ +drop table if exists table_with_utf8_encoding; + +create table table_with_utf8_encoding (name STRING) + ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' + WITH SERDEPROPERTIES ('serialization.encoding'='utf-8'); + +load data local inpath '../../data/files/encoding-utf8.txt' overwrite into table table_with_utf8_encoding; + +select * from table_with_utf8_encoding; + +drop table if exists table_with_non_utf8_encoding; + +create table table_with_non_utf8_encoding (name STRING) + ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' + WITH SERDEPROPERTIES ('serialization.encoding'='ISO8859_1'); + +insert overwrite table table_with_non_utf8_encoding select name from table_with_utf8_encoding; + +select * from table_with_non_utf8_encoding; + diff --git a/ql/src/test/results/clientpositive/insert_non_utf8_encoding_table.q.out b/ql/src/test/results/clientpositive/insert_non_utf8_encoding_table.q.out new file mode 100644 index 0000000..4d97d87 --- /dev/null +++ b/ql/src/test/results/clientpositive/insert_non_utf8_encoding_table.q.out @@ -0,0 +1,89 @@ +PREHOOK: query: drop table if exists table_with_utf8_encoding +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table if exists table_with_utf8_encoding +POSTHOOK: type: DROPTABLE +PREHOOK: query: create table table_with_utf8_encoding (name STRING) + ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' + WITH SERDEPROPERTIES ('serialization.encoding'='utf-8') +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@table_with_utf8_encoding +POSTHOOK: query: create table table_with_utf8_encoding (name STRING) + ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' + WITH SERDEPROPERTIES ('serialization.encoding'='utf-8') +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@table_with_utf8_encoding +PREHOOK: query: load data local inpath '../../data/files/encoding-utf8.txt' overwrite into table table_with_utf8_encoding +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@table_with_utf8_encoding +POSTHOOK: query: load data local inpath '../../data/files/encoding-utf8.txt' overwrite into table table_with_utf8_encoding +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@table_with_utf8_encoding +PREHOOK: query: select * from table_with_utf8_encoding +PREHOOK: type: QUERY +PREHOOK: Input: default@table_with_utf8_encoding +#### A masked pattern was here #### +POSTHOOK: query: select * from table_with_utf8_encoding +POSTHOOK: type: QUERY +POSTHOOK: Input: default@table_with_utf8_encoding +#### A masked pattern was here #### +Tao,Li +Wisgood +Benguo,Me +Xianqiang,Shen +Wensheng,Wang +Haijun,Qiao +Shilong,Zhang +Xiaoqing,You +Aiqing,Song +Zhenhua,Han +Weiqi,Peng +Hua,Li +PREHOOK: query: drop table if exists table_with_non_utf8_encoding +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table if exists table_with_non_utf8_encoding +POSTHOOK: type: DROPTABLE +PREHOOK: query: create table table_with_non_utf8_encoding (name STRING) + ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' + WITH SERDEPROPERTIES ('serialization.encoding'='ISO8859_1') +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@table_with_non_utf8_encoding +POSTHOOK: query: create table table_with_non_utf8_encoding (name STRING) + ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' + WITH SERDEPROPERTIES ('serialization.encoding'='ISO8859_1') +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@table_with_non_utf8_encoding +PREHOOK: query: insert overwrite table table_with_non_utf8_encoding select name from table_with_utf8_encoding +PREHOOK: type: QUERY +PREHOOK: Input: default@table_with_utf8_encoding +PREHOOK: Output: default@table_with_non_utf8_encoding +POSTHOOK: query: insert overwrite table table_with_non_utf8_encoding select name from table_with_utf8_encoding +POSTHOOK: type: QUERY +POSTHOOK: Input: default@table_with_utf8_encoding +POSTHOOK: Output: default@table_with_non_utf8_encoding +POSTHOOK: Lineage: table_with_non_utf8_encoding.name SIMPLE [(table_with_utf8_encoding)table_with_utf8_encoding.FieldSchema(name:name, type:string, comment:null), ] +PREHOOK: query: select * from table_with_non_utf8_encoding +PREHOOK: type: QUERY +PREHOOK: Input: default@table_with_non_utf8_encoding +#### A masked pattern was here #### +POSTHOOK: query: select * from table_with_non_utf8_encoding +POSTHOOK: type: QUERY +POSTHOOK: Input: default@table_with_non_utf8_encoding +#### A masked pattern was here #### +Tao,Li +Wisgood +Benguo,Me +Xianqiang,Shen +Wensheng,Wang +Haijun,Qiao +Shilong,Zhang +Xiaoqing,You +Aiqing,Song +Zhenhua,Han +Weiqi,Peng +Hua,Li