diff --git contrib/src/java/org/apache/hadoop/hive/contrib/serde2/RegexSerDe.java contrib/src/java/org/apache/hadoop/hive/contrib/serde2/RegexSerDe.java index f27b0c7..89d80dc 100644 --- contrib/src/java/org/apache/hadoop/hive/contrib/serde2/RegexSerDe.java +++ contrib/src/java/org/apache/hadoop/hive/contrib/serde2/RegexSerDe.java @@ -29,7 +29,7 @@ import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.serde.serdeConstants; -import org.apache.hadoop.hive.serde2.AbstractSerDe; +import org.apache.hadoop.hive.serde2.AbstractEncodingAwareSerDe; import org.apache.hadoop.hive.serde2.SerDeException; import org.apache.hadoop.hive.serde2.SerDeSpec; import org.apache.hadoop.hive.serde2.SerDeStats; @@ -73,10 +73,10 @@ * based Regex library. */ @SerDeSpec(schemaProps = { - serdeConstants.LIST_COLUMNS, serdeConstants.LIST_COLUMN_TYPES, + serdeConstants.LIST_COLUMNS, serdeConstants.LIST_COLUMN_TYPES, serdeConstants.SERIALIZATION_ENCODING, RegexSerDe.INPUT_REGEX, RegexSerDe.OUTPUT_FORMAT_STRING, RegexSerDe.INPUT_REGEX_CASE_SENSITIVE }) -public class RegexSerDe extends AbstractSerDe { +public class RegexSerDe extends AbstractEncodingAwareSerDe { public static final Logger LOG = LoggerFactory.getLogger(RegexSerDe.class.getName()); @@ -96,6 +96,7 @@ @Override public void initialize(Configuration conf, Properties tbl) throws SerDeException { + super.initialize(conf, tbl); // We can get the table definition from tbl. @@ -174,7 +175,7 @@ long getNextNumberToDisplay(long now) { } @Override - public Object deserialize(Writable blob) throws SerDeException { + public Object doDeserialize(Writable blob) throws SerDeException { if (inputPattern == null) { throw new SerDeException( @@ -218,7 +219,7 @@ public Object deserialize(Writable blob) throws SerDeException { Text outputRowText; @Override - public Writable serialize(Object obj, ObjectInspector objInspector) + public Writable doSerialize(Object obj, ObjectInspector objInspector) throws SerDeException { if (outputFormatString == null) { @@ -276,4 +277,15 @@ public SerDeStats getSerDeStats() { return null; } + @Override + protected Writable transformFromUTF8(Writable blob) { + Text text = (Text)blob; + return SerDeUtils.transformTextFromUTF8(text, this.charset); + } + + @Override + protected Writable transformToUTF8(Writable blob) { + Text text = (Text)blob; + return SerDeUtils.transformTextToUTF8(text, this.charset); + } } diff --git contrib/src/test/queries/clientpositive/serde_regex.q contrib/src/test/queries/clientpositive/serde_regex.q index d75d607..6d21c1b 100644 --- contrib/src/test/queries/clientpositive/serde_regex.q +++ contrib/src/test/queries/clientpositive/serde_regex.q @@ -39,4 +39,32 @@ STORED AS TEXTFILE; LOAD DATA LOCAL INPATH "../../data/files/apache.access.log" INTO TABLE serde_regex; LOAD DATA LOCAL INPATH "../../data/files/apache.access.2.log" INTO TABLE serde_regex; -SELECT * FROM serde_regex ORDER BY time; \ No newline at end of file +SELECT * FROM serde_regex ORDER BY time; + + +EXPLAIN +CREATE TABLE serde_regex2( + key STRING, + value STRING) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' +WITH SERDEPROPERTIES ( + "input.regex" = "([^ ]*),([^ ]*)", + "serialization.encoding" = "ISO8859_1" +) +STORED AS TEXTFILE; + +CREATE TABLE serde_regex2( + key STRING, + value STRING) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' +WITH SERDEPROPERTIES ( + "input.regex" = "([^ ]*),([^ ]*)", + "serialization.encoding" = "ISO8859_1" +) +STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH "../../data/files/encoding_iso-8859-1.txt" INTO TABLE serde_regex2; + +SELECT key, value FROM serde_regex2 ORDER BY key, value; + +DROP TABLE serde_regex2; \ No newline at end of file diff --git contrib/src/test/results/clientpositive/serde_regex.q.out contrib/src/test/results/clientpositive/serde_regex.q.out index 2984293..84b5741 100644 --- contrib/src/test/results/clientpositive/serde_regex.q.out +++ contrib/src/test/results/clientpositive/serde_regex.q.out @@ -114,3 +114,93 @@ POSTHOOK: Input: default@serde_regex #### A masked pattern was here #### 127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326 NULL NULL 127.0.0.1 - - [26/May/2009:00:00:00 +0000] "GET /someurl/?track=Blabla(Main) HTTP/1.1" 200 5864 - "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.65 Safari/525.19" +PREHOOK: query: EXPLAIN +CREATE TABLE serde_regex2( + key STRING, + value STRING) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' +WITH SERDEPROPERTIES ( + "input.regex" = "([^ ]*),([^ ]*)", + "serialization.encoding" = "ISO8859_1" +) +STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: EXPLAIN +CREATE TABLE serde_regex2( + key STRING, + value STRING) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' +WITH SERDEPROPERTIES ( + "input.regex" = "([^ ]*),([^ ]*)", + "serialization.encoding" = "ISO8859_1" +) +STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Create Table Operator: + Create Table + columns: key string, value string + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat + serde name: org.apache.hadoop.hive.serde2.RegexSerDe + serde properties: + input.regex ([^ ]*),([^ ]*) + serialization.encoding ISO8859_1 + name: default.serde_regex2 + +PREHOOK: query: CREATE TABLE serde_regex2( + key STRING, + value STRING) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' +WITH SERDEPROPERTIES ( + "input.regex" = "([^ ]*),([^ ]*)", + "serialization.encoding" = "ISO8859_1" +) +STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@serde_regex2 +POSTHOOK: query: CREATE TABLE serde_regex2( + key STRING, + value STRING) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' +WITH SERDEPROPERTIES ( + "input.regex" = "([^ ]*),([^ ]*)", + "serialization.encoding" = "ISO8859_1" +) +STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@serde_regex2 +PREHOOK: query: LOAD DATA LOCAL INPATH "../../data/files/encoding_iso-8859-1.txt" INTO TABLE serde_regex2 +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@serde_regex2 +POSTHOOK: query: LOAD DATA LOCAL INPATH "../../data/files/encoding_iso-8859-1.txt" INTO TABLE serde_regex2 +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@serde_regex2 +PREHOOK: query: SELECT key, value FROM serde_regex2 ORDER BY key, value +PREHOOK: type: QUERY +PREHOOK: Input: default@serde_regex2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT key, value FROM serde_regex2 ORDER BY key, value +POSTHOOK: type: QUERY +POSTHOOK: Input: default@serde_regex2 +#### A masked pattern was here #### +Jørgensen Jørgen +Müller Thomas +NÃ¥m Fæk +Peña Andrés +PREHOOK: query: DROP TABLE serde_regex2 +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@serde_regex2 +PREHOOK: Output: default@serde_regex2 +POSTHOOK: query: DROP TABLE serde_regex2 +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@serde_regex2 +POSTHOOK: Output: default@serde_regex2 diff --git data/files/opencsv-data.txt data/files/opencsv-data.txt index 7d5968b..233cba6 100644 --- data/files/opencsv-data.txt +++ data/files/opencsv-data.txt @@ -1,3 +1,4 @@ why hello there,42,3,100,1412341,true,42.43,85.23423424 another record,98,4,101,9999999,false,99.89,0.00000009 -third record,45,5,102,999999999,true,89.99,0.00000000000009 \ No newline at end of file +third record,45,5,102,999999999,true,89.99,0.00000000000009 +Müller Thomas,42,3,100,1412341,true,42.43,85.23423424 \ No newline at end of file diff --git hcatalog/core/src/main/java/org/apache/hive/hcatalog/data/JsonSerDe.java hcatalog/core/src/main/java/org/apache/hive/hcatalog/data/JsonSerDe.java index 831e857..921633a 100644 --- hcatalog/core/src/main/java/org/apache/hive/hcatalog/data/JsonSerDe.java +++ hcatalog/core/src/main/java/org/apache/hive/hcatalog/data/JsonSerDe.java @@ -38,7 +38,7 @@ import org.apache.hadoop.hive.common.type.HiveVarchar; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.serde.serdeConstants; -import org.apache.hadoop.hive.serde2.AbstractSerDe; +import org.apache.hadoop.hive.serde2.AbstractEncodingAwareSerDe; import org.apache.hadoop.hive.serde2.SerDeException; import org.apache.hadoop.hive.serde2.SerDeSpec; import org.apache.hadoop.hive.serde2.SerDeStats; @@ -87,9 +87,10 @@ @SerDeSpec(schemaProps = {serdeConstants.LIST_COLUMNS, serdeConstants.LIST_COLUMN_TYPES, - serdeConstants.TIMESTAMP_FORMATS}) + serdeConstants.TIMESTAMP_FORMATS, + serdeConstants.SERIALIZATION_ENCODING}) -public class JsonSerDe extends AbstractSerDe { +public class JsonSerDe extends AbstractEncodingAwareSerDe { private static final Logger LOG = LoggerFactory.getLogger(JsonSerDe.class); private List columnNames; @@ -103,6 +104,7 @@ @Override public void initialize(Configuration conf, Properties tbl) throws SerDeException { + super.initialize(conf, tbl); List columnTypes; StructTypeInfo rowTypeInfo; @@ -159,7 +161,7 @@ public void initialize(Configuration conf, Properties tbl) * our own object implementation, and we use HCatRecord for it */ @Override - public Object deserialize(Writable blob) throws SerDeException { + public Object doDeserialize(Writable blob) throws SerDeException { Text t = (Text) blob; JsonParser p; @@ -419,7 +421,7 @@ private Object getObjectOfCorrespondingPrimitiveType(String s, PrimitiveTypeInfo * and generate a Text representation of the object. */ @Override - public Writable serialize(Object obj, ObjectInspector objInspector) + public Writable doSerialize(Object obj, ObjectInspector objInspector) throws SerDeException { StringBuilder sb = new StringBuilder(); try { @@ -650,4 +652,15 @@ public SerDeStats getSerDeStats() { return null; } + @Override + protected Writable transformFromUTF8(Writable blob) { + Text text = (Text)blob; + return SerDeUtils.transformTextFromUTF8(text, this.charset); + } + + @Override + protected Writable transformToUTF8(Writable blob) { + Text text = (Text)blob; + return SerDeUtils.transformTextToUTF8(text, this.charset); + } } diff --git ql/src/test/queries/clientpositive/json_serde1.q ql/src/test/queries/clientpositive/json_serde1.q index 85f5af2..f1cb057 100644 --- ql/src/test/queries/clientpositive/json_serde1.q +++ ql/src/test/queries/clientpositive/json_serde1.q @@ -3,6 +3,7 @@ add jar ${system:maven.local.repository}/org/apache/hive/hcatalog/hive-hcatalog- drop table if exists json_serde1_1; drop table if exists json_serde1_2; +drop table if exists json_serde1_3; create table json_serde1_1 (a array,b map) row format serde 'org.apache.hive.hcatalog.data.JsonSerDe'; @@ -32,5 +33,15 @@ insert into table json_serde1_2 select * from json_serde1_2; +create table json_serde1_3 (a array,b map) + row format serde 'org.apache.hive.hcatalog.data.JsonSerDe' + WITH SERDEPROPERTIES ('serialization.encoding'='ISO8859_1'); + +insert into table json_serde1_3 + select array('Müller'),map('Müller',1) from src limit 2; + +select * from json_serde1_3; + drop table json_serde1_1; drop table json_serde1_2; +drop table json_serde1_3; diff --git ql/src/test/queries/clientpositive/serde_opencsv.q ql/src/test/queries/clientpositive/serde_opencsv.q index 26d79a6..66ea808 100644 --- ql/src/test/queries/clientpositive/serde_opencsv.q +++ ql/src/test/queries/clientpositive/serde_opencsv.q @@ -12,7 +12,8 @@ ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde' WITH SERDEPROPERTIES( "separatorChar" = ",", "quoteChar" = "\'", - "escapeChar" = "\\" + "escapeChar" = "\\", + "serialization.encoding" = "ISO8859_1" ) stored as textfile; CREATE TABLE serde_opencsv( @@ -28,9 +29,10 @@ ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde' WITH SERDEPROPERTIES( "separatorChar" = ",", "quoteChar" = "\'", - "escapeChar" = "\\" + "escapeChar" = "\\", + "serialization.encoding" = "ISO8859_1" ) stored as textfile; LOAD DATA LOCAL INPATH "../../data/files/opencsv-data.txt" INTO TABLE serde_opencsv; -SELECT count(*) FROM serde_opencsv; +SELECT * FROM serde_opencsv; diff --git ql/src/test/queries/clientpositive/serde_regex.q ql/src/test/queries/clientpositive/serde_regex.q index e21c6e1..e51ba14 100644 --- ql/src/test/queries/clientpositive/serde_regex.q +++ ql/src/test/queries/clientpositive/serde_regex.q @@ -41,6 +41,7 @@ SELECT host, size, status, time from serde_regex ORDER BY time; DROP TABLE serde_regex; + EXPLAIN CREATE TABLE serde_regex1( key decimal(38,18), @@ -65,3 +66,31 @@ LOAD DATA LOCAL INPATH "../../data/files/kv7.txt" INTO TABLE serde_regex1; SELECT key, value FROM serde_regex1 ORDER BY key, value; DROP TABLE serde_regex1; + + +EXPLAIN +CREATE TABLE serde_regex2( + key STRING, + value STRING) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' +WITH SERDEPROPERTIES ( + "input.regex" = "([^ ]*),([^ ]*)", + "serialization.encoding" = "ISO8859_1" +) +STORED AS TEXTFILE; + +CREATE TABLE serde_regex2( + key STRING, + value STRING) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' +WITH SERDEPROPERTIES ( + "input.regex" = "([^ ]*),([^ ]*)", + "serialization.encoding" = "ISO8859_1" +) +STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH "../../data/files/encoding_iso-8859-1.txt" INTO TABLE serde_regex2; + +SELECT key, value FROM serde_regex2 ORDER BY key, value; + +DROP TABLE serde_regex2; diff --git ql/src/test/results/clientpositive/json_serde1.q.out ql/src/test/results/clientpositive/json_serde1.q.out index e14d674..878505f 100644 --- ql/src/test/results/clientpositive/json_serde1.q.out +++ ql/src/test/results/clientpositive/json_serde1.q.out @@ -6,6 +6,10 @@ PREHOOK: query: drop table if exists json_serde1_2 PREHOOK: type: DROPTABLE POSTHOOK: query: drop table if exists json_serde1_2 POSTHOOK: type: DROPTABLE +PREHOOK: query: drop table if exists json_serde1_3 +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table if exists json_serde1_3 +POSTHOOK: type: DROPTABLE PREHOOK: query: create table json_serde1_1 (a array,b map) row format serde 'org.apache.hive.hcatalog.data.JsonSerDe' PREHOOK: type: CREATETABLE @@ -95,6 +99,40 @@ POSTHOOK: Input: default@json_serde1_2 #### A masked pattern was here #### [3,2,1] {1:"2001-01-01",2:null} {"c1":123456,"c2":"hello","c3":["aa","bb","cc"],"c4":{"abc":123,"xyz":456},"c5":{"c5_1":"bye","c5_2":88}} [3,2,1] {1:"2001-01-01",2:null} {"c1":123456,"c2":"hello","c3":["aa","bb","cc"],"c4":{"abc":123,"xyz":456},"c5":{"c5_1":"bye","c5_2":88}} +PREHOOK: query: create table json_serde1_3 (a array,b map) + row format serde 'org.apache.hive.hcatalog.data.JsonSerDe' + WITH SERDEPROPERTIES ('serialization.encoding'='ISO8859_1') +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@json_serde1_3 +POSTHOOK: query: create table json_serde1_3 (a array,b map) + row format serde 'org.apache.hive.hcatalog.data.JsonSerDe' + WITH SERDEPROPERTIES ('serialization.encoding'='ISO8859_1') +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@json_serde1_3 +PREHOOK: query: insert into table json_serde1_3 + select array('Müller'),map('Müller',1) from src limit 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@json_serde1_3 +POSTHOOK: query: insert into table json_serde1_3 + select array('Müller'),map('Müller',1) from src limit 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@json_serde1_3 +POSTHOOK: Lineage: json_serde1_3.a EXPRESSION [] +POSTHOOK: Lineage: json_serde1_3.b EXPRESSION [] +PREHOOK: query: select * from json_serde1_3 +PREHOOK: type: QUERY +PREHOOK: Input: default@json_serde1_3 +#### A masked pattern was here #### +POSTHOOK: query: select * from json_serde1_3 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@json_serde1_3 +#### A masked pattern was here #### +["Müller"] {"Müller":1} +["Müller"] {"Müller":1} PREHOOK: query: drop table json_serde1_1 PREHOOK: type: DROPTABLE PREHOOK: Input: default@json_serde1_1 @@ -111,3 +149,11 @@ POSTHOOK: query: drop table json_serde1_2 POSTHOOK: type: DROPTABLE POSTHOOK: Input: default@json_serde1_2 POSTHOOK: Output: default@json_serde1_2 +PREHOOK: query: drop table json_serde1_3 +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@json_serde1_3 +PREHOOK: Output: default@json_serde1_3 +POSTHOOK: query: drop table json_serde1_3 +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@json_serde1_3 +POSTHOOK: Output: default@json_serde1_3 diff --git ql/src/test/results/clientpositive/serde_opencsv.q.out ql/src/test/results/clientpositive/serde_opencsv.q.out index 1ffc229..6f79e31 100644 --- ql/src/test/results/clientpositive/serde_opencsv.q.out +++ ql/src/test/results/clientpositive/serde_opencsv.q.out @@ -12,7 +12,8 @@ ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde' WITH SERDEPROPERTIES( "separatorChar" = ",", "quoteChar" = "\'", - "escapeChar" = "\\" + "escapeChar" = "\\", + "serialization.encoding" = "ISO8859_1" ) stored as textfile PREHOOK: type: CREATETABLE POSTHOOK: query: EXPLAIN @@ -29,7 +30,8 @@ ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde' WITH SERDEPROPERTIES( "separatorChar" = ",", "quoteChar" = "\'", - "escapeChar" = "\\" + "escapeChar" = "\\", + "serialization.encoding" = "ISO8859_1" ) stored as textfile POSTHOOK: type: CREATETABLE STAGE DEPENDENCIES: @@ -47,6 +49,7 @@ STAGE PLANS: escapeChar \ quoteChar ' separatorChar , + serialization.encoding ISO8859_1 name: default.serde_opencsv PREHOOK: query: CREATE TABLE serde_opencsv( @@ -62,7 +65,8 @@ ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde' WITH SERDEPROPERTIES( "separatorChar" = ",", "quoteChar" = "\'", - "escapeChar" = "\\" + "escapeChar" = "\\", + "serialization.encoding" = "ISO8859_1" ) stored as textfile PREHOOK: type: CREATETABLE PREHOOK: Output: database:default @@ -80,7 +84,8 @@ ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde' WITH SERDEPROPERTIES( "separatorChar" = ",", "quoteChar" = "\'", - "escapeChar" = "\\" + "escapeChar" = "\\", + "serialization.encoding" = "ISO8859_1" ) stored as textfile POSTHOOK: type: CREATETABLE POSTHOOK: Output: database:default @@ -93,12 +98,15 @@ POSTHOOK: query: LOAD DATA LOCAL INPATH "../../data/files/opencsv-data.txt" INTO POSTHOOK: type: LOAD #### A masked pattern was here #### POSTHOOK: Output: default@serde_opencsv -PREHOOK: query: SELECT count(*) FROM serde_opencsv +PREHOOK: query: SELECT * FROM serde_opencsv PREHOOK: type: QUERY PREHOOK: Input: default@serde_opencsv #### A masked pattern was here #### -POSTHOOK: query: SELECT count(*) FROM serde_opencsv +POSTHOOK: query: SELECT * FROM serde_opencsv POSTHOOK: type: QUERY POSTHOOK: Input: default@serde_opencsv #### A masked pattern was here #### -3 +why hello there 42 3 100 1412341 true 42.43 85.23423424 +another record 98 4 101 9999999 false 99.89 0.00000009 +third record 45 5 102 999999999 true 89.99 0.00000000000009 +Müller Thomas 42 3 100 1412341 true 42.43 85.23423424 diff --git ql/src/test/results/clientpositive/serde_regex.q.out ql/src/test/results/clientpositive/serde_regex.q.out index 7bebb0c..c06d413 100644 --- ql/src/test/results/clientpositive/serde_regex.q.out +++ ql/src/test/results/clientpositive/serde_regex.q.out @@ -246,3 +246,93 @@ POSTHOOK: query: DROP TABLE serde_regex1 POSTHOOK: type: DROPTABLE POSTHOOK: Input: default@serde_regex1 POSTHOOK: Output: default@serde_regex1 +PREHOOK: query: EXPLAIN +CREATE TABLE serde_regex2( + key STRING, + value STRING) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' +WITH SERDEPROPERTIES ( + "input.regex" = "([^ ]*),([^ ]*)", + "serialization.encoding" = "ISO8859_1" +) +STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: EXPLAIN +CREATE TABLE serde_regex2( + key STRING, + value STRING) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' +WITH SERDEPROPERTIES ( + "input.regex" = "([^ ]*),([^ ]*)", + "serialization.encoding" = "ISO8859_1" +) +STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Create Table Operator: + Create Table + columns: key string, value string + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat + serde name: org.apache.hadoop.hive.serde2.RegexSerDe + serde properties: + input.regex ([^ ]*),([^ ]*) + serialization.encoding ISO8859_1 + name: default.serde_regex2 + +PREHOOK: query: CREATE TABLE serde_regex2( + key STRING, + value STRING) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' +WITH SERDEPROPERTIES ( + "input.regex" = "([^ ]*),([^ ]*)", + "serialization.encoding" = "ISO8859_1" +) +STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@serde_regex2 +POSTHOOK: query: CREATE TABLE serde_regex2( + key STRING, + value STRING) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' +WITH SERDEPROPERTIES ( + "input.regex" = "([^ ]*),([^ ]*)", + "serialization.encoding" = "ISO8859_1" +) +STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@serde_regex2 +PREHOOK: query: LOAD DATA LOCAL INPATH "../../data/files/encoding_iso-8859-1.txt" INTO TABLE serde_regex2 +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@serde_regex2 +POSTHOOK: query: LOAD DATA LOCAL INPATH "../../data/files/encoding_iso-8859-1.txt" INTO TABLE serde_regex2 +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@serde_regex2 +PREHOOK: query: SELECT key, value FROM serde_regex2 ORDER BY key, value +PREHOOK: type: QUERY +PREHOOK: Input: default@serde_regex2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT key, value FROM serde_regex2 ORDER BY key, value +POSTHOOK: type: QUERY +POSTHOOK: Input: default@serde_regex2 +#### A masked pattern was here #### +Jørgensen Jørgen +Müller Thomas +NÃ¥m Fæk +Peña Andrés +PREHOOK: query: DROP TABLE serde_regex2 +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@serde_regex2 +PREHOOK: Output: default@serde_regex2 +POSTHOOK: query: DROP TABLE serde_regex2 +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@serde_regex2 +POSTHOOK: Output: default@serde_regex2 diff --git serde/src/java/org/apache/hadoop/hive/serde2/OpenCSVSerde.java serde/src/java/org/apache/hadoop/hive/serde2/OpenCSVSerde.java index a7059c0..097987c 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/OpenCSVSerde.java +++ serde/src/java/org/apache/hadoop/hive/serde2/OpenCSVSerde.java @@ -50,9 +50,9 @@ * */ @SerDeSpec(schemaProps = { - serdeConstants.LIST_COLUMNS, + serdeConstants.LIST_COLUMNS, serdeConstants.SERIALIZATION_ENCODING, OpenCSVSerde.SEPARATORCHAR, OpenCSVSerde.QUOTECHAR, OpenCSVSerde.ESCAPECHAR}) -public final class OpenCSVSerde extends AbstractSerDe { +public final class OpenCSVSerde extends AbstractEncodingAwareSerDe { public static final Logger LOG = LoggerFactory.getLogger(OpenCSVSerde.class.getName()); private ObjectInspector inspector; @@ -70,6 +70,7 @@ @Override public void initialize(final Configuration conf, final Properties tbl) throws SerDeException { + super.initialize(conf, tbl); final List columnNames = Arrays.asList(tbl.getProperty(serdeConstants.LIST_COLUMNS) .split(",")); @@ -106,7 +107,7 @@ private char getProperty(final Properties tbl, final String property, final char } @Override - public Writable serialize(Object obj, ObjectInspector objInspector) throws SerDeException { + public Writable doSerialize(Object obj, ObjectInspector objInspector) throws SerDeException { final StructObjectInspector outputRowOI = (StructObjectInspector) objInspector; final List outputFieldRefs = outputRowOI.getAllStructFieldRefs(); @@ -142,7 +143,7 @@ public Writable serialize(Object obj, ObjectInspector objInspector) throws SerDe } @Override - public Object deserialize(final Writable blob) throws SerDeException { + public Object doDeserialize(final Writable blob) throws SerDeException { Text rowText = (Text) blob; CSVReader csv = null; @@ -205,4 +206,14 @@ public ObjectInspector getObjectInspector() throws SerDeException { public SerDeStats getSerDeStats() { return null; } + + protected Text transformFromUTF8(Writable blob) { + Text text = (Text)blob; + return SerDeUtils.transformTextFromUTF8(text, this.charset); + } + + protected Text transformToUTF8(Writable blob) { + Text text = (Text) blob; + return SerDeUtils.transformTextToUTF8(text, this.charset); + } } diff --git serde/src/java/org/apache/hadoop/hive/serde2/RegexSerDe.java serde/src/java/org/apache/hadoop/hive/serde2/RegexSerDe.java index 156b410..83cd702 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/RegexSerDe.java +++ serde/src/java/org/apache/hadoop/hive/serde2/RegexSerDe.java @@ -70,9 +70,9 @@ * based Regex library. */ @SerDeSpec(schemaProps = { - serdeConstants.LIST_COLUMNS, serdeConstants.LIST_COLUMN_TYPES, + serdeConstants.LIST_COLUMNS, serdeConstants.LIST_COLUMN_TYPES, serdeConstants.SERIALIZATION_ENCODING, RegexSerDe.INPUT_REGEX, RegexSerDe.INPUT_REGEX_CASE_SENSITIVE }) -public class RegexSerDe extends AbstractSerDe { +public class RegexSerDe extends AbstractEncodingAwareSerDe { public static final Logger LOG = LoggerFactory.getLogger(RegexSerDe.class.getName()); @@ -96,6 +96,7 @@ @Override public void initialize(Configuration conf, Properties tbl) throws SerDeException { + super.initialize(conf, tbl); // We can get the table definition from tbl. @@ -177,7 +178,7 @@ public ObjectInspector getObjectInspector() throws SerDeException { long partialMatchedRowsCount = 0; @Override - public Object deserialize(Writable blob) throws SerDeException { + public Object doDeserialize(Writable blob) throws SerDeException { Text rowText = (Text) blob; Matcher m = inputPattern.matcher(rowText.toString()); @@ -285,7 +286,7 @@ public Object deserialize(Writable blob) throws SerDeException { } @Override - public Writable serialize(Object obj, ObjectInspector objInspector) + public Writable doSerialize(Object obj, ObjectInspector objInspector) throws SerDeException { throw new UnsupportedOperationException( "Regex SerDe doesn't support the serialize() method"); @@ -296,4 +297,16 @@ public SerDeStats getSerDeStats() { // no support for statistics return null; } + + @Override + protected Writable transformFromUTF8(Writable blob) { + Text text = (Text)blob; + return SerDeUtils.transformTextFromUTF8(text, this.charset); + } + + @Override + protected Writable transformToUTF8(Writable blob) { + Text text = (Text)blob; + return SerDeUtils.transformTextToUTF8(text, this.charset); + } }