diff --git serde/if/serde.thrift serde/if/serde.thrift index 31c87ee..76f95b5 100644 --- serde/if/serde.thrift +++ serde/if/serde.thrift @@ -30,6 +30,7 @@ const string SERIALIZATION_NULL_FORMAT = "serialization.null.format" const string SERIALIZATION_LAST_COLUMN_TAKES_REST = "serialization.last.column.takes.rest" const string SERIALIZATION_SORT_ORDER = "serialization.sort.order" const string SERIALIZATION_USE_JSON_OBJECTS = "serialization.use.json.object" +const string SERIALIZATION_ENCODING = "serialization.encoding" const string FIELD_DELIM = "field.delim" const string COLLECTION_DELIM = "colelction.delim" diff --git serde/src/gen/thrift/gen-cpp/serde_constants.cpp serde/src/gen/thrift/gen-cpp/serde_constants.cpp index 54503e3..bd5c16d 100644 --- serde/src/gen/thrift/gen-cpp/serde_constants.cpp +++ serde/src/gen/thrift/gen-cpp/serde_constants.cpp @@ -27,6 +27,8 @@ serdeConstants::serdeConstants() { SERIALIZATION_USE_JSON_OBJECTS = "serialization.use.json.object"; + SERIALIZATION_ENCODING = "serialization.encoding"; + FIELD_DELIM = "field.delim"; COLLECTION_DELIM = "colelction.delim"; diff --git serde/src/gen/thrift/gen-cpp/serde_constants.h serde/src/gen/thrift/gen-cpp/serde_constants.h index d56c917..1455382 100644 --- serde/src/gen/thrift/gen-cpp/serde_constants.h +++ serde/src/gen/thrift/gen-cpp/serde_constants.h @@ -23,6 +23,7 @@ class serdeConstants { std::string SERIALIZATION_LAST_COLUMN_TAKES_REST; std::string SERIALIZATION_SORT_ORDER; std::string SERIALIZATION_USE_JSON_OBJECTS; + std::string SERIALIZATION_ENCODING; std::string FIELD_DELIM; std::string COLLECTION_DELIM; std::string LINE_DELIM; diff --git serde/src/gen/thrift/gen-javabean/org/apache/hadoop/hive/serde/serdeConstants.java serde/src/gen/thrift/gen-javabean/org/apache/hadoop/hive/serde/serdeConstants.java index 515cf25..8d3595b 100644 --- serde/src/gen/thrift/gen-javabean/org/apache/hadoop/hive/serde/serdeConstants.java +++ serde/src/gen/thrift/gen-javabean/org/apache/hadoop/hive/serde/serdeConstants.java @@ -49,6 +49,8 @@ public static final String SERIALIZATION_USE_JSON_OBJECTS = "serialization.use.json.object"; + public static final String SERIALIZATION_ENCODING = "serialization.encoding"; + public static final String FIELD_DELIM = "field.delim"; public static final String COLLECTION_DELIM = "colelction.delim"; diff --git serde/src/gen/thrift/gen-php/org/apache/hadoop/hive/serde/Types.php serde/src/gen/thrift/gen-php/org/apache/hadoop/hive/serde/Types.php index 837dd11..3c2f0a9 100644 --- serde/src/gen/thrift/gen-php/org/apache/hadoop/hive/serde/Types.php +++ serde/src/gen/thrift/gen-php/org/apache/hadoop/hive/serde/Types.php @@ -32,6 +32,8 @@ $GLOBALS['serde_CONSTANTS']['SERIALIZATION_SORT_ORDER'] = "serialization.sort.or $GLOBALS['serde_CONSTANTS']['SERIALIZATION_USE_JSON_OBJECTS'] = "serialization.use.json.object"; +$GLOBALS['serde_CONSTANTS']['SERIALIZATION_ENCODING'] = "serialization.encoding"; + $GLOBALS['serde_CONSTANTS']['FIELD_DELIM'] = "field.delim"; $GLOBALS['serde_CONSTANTS']['COLLECTION_DELIM'] = "colelction.delim"; diff --git serde/src/gen/thrift/gen-py/org_apache_hadoop_hive_serde/constants.py serde/src/gen/thrift/gen-py/org_apache_hadoop_hive_serde/constants.py index 8eac87d..08ca294 100644 --- serde/src/gen/thrift/gen-py/org_apache_hadoop_hive_serde/constants.py +++ serde/src/gen/thrift/gen-py/org_apache_hadoop_hive_serde/constants.py @@ -17,6 +17,7 @@ SERIALIZATION_LAST_COLUMN_TAKES_REST = "serialization.last.column.takes.rest" SERIALIZATION_SORT_ORDER = "serialization.sort.order" SERIALIZATION_USE_JSON_OBJECTS = "serialization.use.json.object" +SERIALIZATION_ENCODING = "serialization.encoding" FIELD_DELIM = "field.delim" COLLECTION_DELIM = "colelction.delim" LINE_DELIM = "line.delim" diff --git serde/src/gen/thrift/gen-rb/serde_constants.rb serde/src/gen/thrift/gen-rb/serde_constants.rb index ed86522..40375f5 100644 --- serde/src/gen/thrift/gen-rb/serde_constants.rb +++ serde/src/gen/thrift/gen-rb/serde_constants.rb @@ -23,6 +23,8 @@ SERIALIZATION_SORT_ORDER = %q"serialization.sort.order" SERIALIZATION_USE_JSON_OBJECTS = %q"serialization.use.json.object" +SERIALIZATION_ENCODING = %q"serialization.encoding" + FIELD_DELIM = %q"field.delim" COLLECTION_DELIM = %q"colelction.delim" diff --git serde/src/java/org/apache/hadoop/hive/serde2/AbstractEncodingAwareSerDe.java serde/src/java/org/apache/hadoop/hive/serde2/AbstractEncodingAwareSerDe.java new file mode 100644 index 0000000..3668c56 --- /dev/null +++ serde/src/java/org/apache/hadoop/hive/serde2/AbstractEncodingAwareSerDe.java @@ -0,0 +1,82 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.serde2; + +import java.nio.charset.Charset; +import java.util.Properties; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.serde.serdeConstants; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.io.Writable; + +import com.google.common.base.Charsets; + +/** + * AbstractEncodingAwareSerDe aware the encoding from table properties, + * transform data from specified charset to UTF-8 during serialize, and + * transform data from UTF-8 to specified charset during deserialize. + */ +public abstract class AbstractEncodingAwareSerDe extends AbstractSerDe { + + protected Charset charset; + + @Override + @Deprecated + public void initialize(Configuration conf, Properties tbl) + throws SerDeException { + charset = Charset.forName(tbl.getProperty(serdeConstants.SERIALIZATION_ENCODING, "UTF-8")); + } + + @Override + public final Writable serialize(Object obj, ObjectInspector objInspector) + throws SerDeException { + Writable result = doSerialize(obj, objInspector); + if (!this.charset.equals(Charsets.UTF_8)) { + result = transformFromUTF8(result); + } + return result; + } + + /** + * transform Writable data from UTF-8 to charset before serialize. + * @param blob + * @return + */ + protected abstract Writable transformFromUTF8(Writable blob); + + protected abstract Writable doSerialize(Object obj, ObjectInspector objInspector) throws SerDeException; + + @Override + public final Object deserialize(Writable blob) throws SerDeException { + if (!this.charset.equals(Charsets.UTF_8)) { + blob = transformToUTF8(blob); + } + return doDeserialize(blob); + } + + /** + * transform Writable data from charset to UTF-8 before doDeserialize. + * @param blob + * @return + */ + protected abstract Writable transformToUTF8(Writable blob); + + protected abstract Object doDeserialize(Writable blob) throws SerDeException; +} diff --git serde/src/java/org/apache/hadoop/hive/serde2/DelimitedJSONSerDe.java serde/src/java/org/apache/hadoop/hive/serde2/DelimitedJSONSerDe.java index 179f9b5..2bdb4ea 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/DelimitedJSONSerDe.java +++ serde/src/java/org/apache/hadoop/hive/serde2/DelimitedJSONSerDe.java @@ -46,7 +46,7 @@ public DelimitedJSONSerDe() throws SerDeException { * Not implemented. */ @Override - public Object deserialize(Writable field) throws SerDeException { + public Object doDeserialize(Writable field) throws SerDeException { LOG.error("DelimitedJSONSerDe cannot deserialize."); throw new SerDeException("DelimitedJSONSerDe cannot deserialize."); } diff --git serde/src/java/org/apache/hadoop/hive/serde2/SerDeUtils.java serde/src/java/org/apache/hadoop/hive/serde2/SerDeUtils.java index b7fb048..274d468 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/SerDeUtils.java +++ serde/src/java/org/apache/hadoop/hive/serde2/SerDeUtils.java @@ -18,6 +18,7 @@ package org.apache.hadoop.hive.serde2; +import java.nio.charset.Charset; import java.util.ArrayList; import java.util.List; import java.util.Map; @@ -523,4 +524,12 @@ public static void initializeSerDe(Deserializer deserializer, Configuration conf private SerDeUtils() { // prevent instantiation } + + public static Text transformTextToUTF8(Text text, Charset previousCharset) { + return new Text(new String(text.getBytes(), previousCharset)); + } + + public static Text transformTextFromUTF8(Text text, Charset targetCharset) { + return new Text(new String(text.getBytes()).getBytes(targetCharset)); + } } diff --git serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazySimpleSerDe.java serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazySimpleSerDe.java index fb55c70..2c998ef 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazySimpleSerDe.java +++ serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazySimpleSerDe.java @@ -19,6 +19,7 @@ package org.apache.hadoop.hive.serde2.lazy; import java.io.IOException; +import java.nio.charset.Charset; import java.util.Arrays; import java.util.List; import java.util.Map; @@ -29,11 +30,13 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.conf.HiveConf.ConfVars; import org.apache.hadoop.hive.serde.serdeConstants; +import org.apache.hadoop.hive.serde2.AbstractEncodingAwareSerDe; import org.apache.hadoop.hive.serde2.AbstractSerDe; import org.apache.hadoop.hive.serde2.ByteStream; import org.apache.hadoop.hive.serde2.SerDe; import org.apache.hadoop.hive.serde2.SerDeException; import org.apache.hadoop.hive.serde2.SerDeStats; +import org.apache.hadoop.hive.serde2.SerDeUtils; import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; @@ -61,7 +64,7 @@ * Also LazySimpleSerDe outputs typed columns instead of treating all columns as * String like MetadataTypedColumnsetSerDe. */ -public class LazySimpleSerDe extends AbstractSerDe { +public class LazySimpleSerDe extends AbstractEncodingAwareSerDe { public static final Log LOG = LogFactory.getLog(LazySimpleSerDe.class .getName()); @@ -187,6 +190,8 @@ public byte getEscapeChar() { public void initialize(Configuration job, Properties tbl) throws SerDeException { + super.initialize(job, tbl); + serdeParams = LazySimpleSerDe.initSerdeParams(job, tbl, getClass() .getName()); @@ -330,7 +335,7 @@ public static SerDeParameters initSerdeParams(Configuration job, * @see SerDe#deserialize(Writable) */ @Override - public Object deserialize(Writable field) throws SerDeException { + public Object doDeserialize(Writable field) throws SerDeException { if (byteArrayRef == null) { byteArrayRef = new ByteArrayRef(); } @@ -375,7 +380,7 @@ public ObjectInspector getObjectInspector() throws SerDeException { * @see SerDe#serialize(Object, ObjectInspector) */ @Override - public Writable serialize(Object obj, ObjectInspector objInspector) + public Writable doSerialize(Object obj, ObjectInspector objInspector) throws SerDeException { if (objInspector.getCategory() != Category.STRUCT) { @@ -584,4 +589,16 @@ public SerDeStats getSerDeStats() { return stats; } + + @Override + protected Writable transformFromUTF8(Writable blob) { + Text text = (Text)blob; + return SerDeUtils.transformTextFromUTF8(text, this.charset); + } + + @Override + protected Writable transformToUTF8(Writable blob) { + Text text = (Text)blob; + return SerDeUtils.transformTextToUTF8(text, this.charset); + } } diff --git service/src/gen/thrift/gen-py/TCLIService/TCLIService-remote service/src/gen/thrift/gen-py/TCLIService/TCLIService-remote old mode 100644 new mode 100755 diff --git service/src/gen/thrift/gen-py/hive_service/ThriftHive-remote service/src/gen/thrift/gen-py/hive_service/ThriftHive-remote old mode 100644 new mode 100755