diff --git serde/src/java/org/apache/hadoop/hive/serde2/AbstractEncodingAwareSerDe.java serde/src/java/org/apache/hadoop/hive/serde2/AbstractEncodingAwareSerDe.java
new file mode 100644
index 0000000..1489081
--- /dev/null
+++ serde/src/java/org/apache/hadoop/hive/serde2/AbstractEncodingAwareSerDe.java
@@ -0,0 +1,134 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.serde2;
+
+import java.nio.charset.Charset;
+import java.util.Properties;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.io.Writable;
+
+import com.google.common.base.Charsets;
+
+/**
+ * AbstractEncodingAwareSerDe reads the data encoding from the
+ * "serialization.encoding" table property (UTF-8 when absent). When that
+ * encoding is not UTF-8, data is transformed from UTF-8 to the specified
+ * charset during serialize, and from the specified charset to UTF-8 during
+ * deserialize.
+ */
+public abstract class AbstractEncodingAwareSerDe extends AbstractSerDe {
+
+  /** Charset read from the "serialization.encoding" table property by initialize. */
+  protected Charset charset;
+
+  /**
+   * Reads the charset from the "serialization.encoding" table property,
+   * defaulting to UTF-8 when the property is absent.
+   *
+   * @param conf not read by this method
+   * @param tbl the table properties
+   * @throws SerDeException if the charset name is illegal or unsupported
+   */
+  @Override
+  @Deprecated
+  public void initialize(Configuration conf, Properties tbl)
+      throws SerDeException {
+    String charsetName = tbl.getProperty("serialization.encoding", "UTF-8");
+    try {
+      charset = Charset.forName(charsetName);
+    } catch (IllegalArgumentException e) {
+      // IllegalCharsetNameException and UnsupportedCharsetException both
+      // extend IllegalArgumentException; surface them as a SerDeException.
+      throw new SerDeException("Invalid serialization.encoding: " + charsetName, e);
+    }
+  }
+
+  /**
+   * Serializes with doSerialize, then passes the result through
+   * transformFromUTF8 when the charset is not UTF-8.
+   *
+   * @param obj the object to serialize
+   * @param objInspector the inspector for obj
+   * @return the serialized data
+   * @throws SerDeException if doSerialize throws it
+   */
+  @Override
+  public final Writable serialize(Object obj, ObjectInspector objInspector)
+      throws SerDeException {
+    Writable result = doSerialize(obj, objInspector);
+    if (!this.charset.equals(Charsets.UTF_8)) {
+      result = transformFromUTF8(result);
+    }
+    return result;
+  }
+
+  /**
+   * Transforms Writable data from UTF-8 to charset; applied to the result of
+   * doSerialize.
+   *
+   * @param blob the UTF-8 data produced by doSerialize
+   * @return the data encoded in charset
+   */
+  protected abstract Writable transformFromUTF8(Writable blob);
+
+  /**
+   * Performs the actual serialization; called by serialize.
+   *
+   * @param obj the object to serialize
+   * @param objInspector the inspector for obj
+   * @return the serialized data
+   * @throws SerDeException if serialization fails
+   */
+  protected abstract Writable doSerialize(Object obj, ObjectInspector objInspector) throws SerDeException;
+
+  /**
+   * Passes blob through transformToUTF8 when the charset is not UTF-8, then
+   * deserializes it with doDeserialize.
+   *
+   * @param blob the data to deserialize
+   * @return the deserialized object
+   * @throws SerDeException if doDeserialize throws it
+   */
+  @Override
+  public final Object deserialize(Writable blob) throws SerDeException {
+    if (!this.charset.equals(Charsets.UTF_8)) {
+      blob = transformToUTF8(blob);
+    }
+    return doDeserialize(blob);
+  }
+
+  /**
+   * Transforms Writable data from charset to UTF-8 before doDeserialize.
+   *
+   * @param blob the data encoded in charset
+   * @return the data encoded in UTF-8
+   */
+  protected abstract Writable transformToUTF8(Writable blob);
+
+  /**
+   * Performs the actual deserialization; called by deserialize.
+   *
+   * @param blob the data to deserialize
+   * @return the deserialized object
+   * @throws SerDeException if deserialization fails
+   */
+  protected abstract Object doDeserialize(Writable blob) throws SerDeException;
+}
diff --git serde/src/java/org/apache/hadoop/hive/serde2/DelimitedJSONSerDe.java serde/src/java/org/apache/hadoop/hive/serde2/DelimitedJSONSerDe.java
index 179f9b5..2bdb4ea 100644
--- serde/src/java/org/apache/hadoop/hive/serde2/DelimitedJSONSerDe.java
+++ serde/src/java/org/apache/hadoop/hive/serde2/DelimitedJSONSerDe.java
@@ -46,7 +46,7 @@ public DelimitedJSONSerDe() throws SerDeException {
    * Not implemented.
    */
   @Override
-  public Object deserialize(Writable field) throws SerDeException {
+  public Object doDeserialize(Writable field) throws SerDeException {
     LOG.error("DelimitedJSONSerDe cannot deserialize.");
     throw new SerDeException("DelimitedJSONSerDe cannot deserialize.");
   }
diff --git serde/src/java/org/apache/hadoop/hive/serde2/SerDeUtils.java serde/src/java/org/apache/hadoop/hive/serde2/SerDeUtils.java
index b7fb048..274d468 100644
--- serde/src/java/org/apache/hadoop/hive/serde2/SerDeUtils.java
+++ serde/src/java/org/apache/hadoop/hive/serde2/SerDeUtils.java
@@ -18,6 +18,7 @@
 
 package org.apache.hadoop.hive.serde2;
 
+import java.nio.charset.Charset;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Map;
@@ -523,4 +524,30 @@ public static void initializeSerDe(Deserializer deserializer, Configuration conf
   private SerDeUtils() {
     // prevent instantiation
   }
+
+  /**
+   * Re-encodes text from the given charset to UTF-8.
+   *
+   * @param text the text whose valid bytes are encoded in previousCharset
+   * @param previousCharset the charset the bytes are currently encoded in
+   * @return a new Text holding the same characters encoded as UTF-8
+   */
+  public static Text transformTextToUTF8(Text text, Charset previousCharset) {
+    // Text.getBytes() exposes the backing array, which is only valid up to
+    // getLength(); decoding the whole array would pick up stale bytes.
+    return new Text(new String(text.getBytes(), 0, text.getLength(), previousCharset));
+  }
+
+  /**
+   * Re-encodes UTF-8 text to the given charset.
+   *
+   * @param text the UTF-8 encoded text
+   * @param targetCharset the charset to encode the characters in
+   * @return a new Text holding the same characters encoded in targetCharset
+   */
+  public static Text transformTextFromUTF8(Text text, Charset targetCharset) {
+    // toString() decodes exactly the valid bytes as UTF-8, independent of the
+    // platform default charset.
+    return new Text(text.toString().getBytes(targetCharset));
+  }
 }
diff --git serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazySimpleSerDe.java serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazySimpleSerDe.java
index fb55c70..10e93e1 100644
--- serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazySimpleSerDe.java
+++ serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazySimpleSerDe.java
@@ -19,6 +19,7 @@
 package org.apache.hadoop.hive.serde2.lazy;
 
 import java.io.IOException;
+import java.nio.charset.Charset;
 import java.util.Arrays;
 import java.util.List;
 import java.util.Map;
@@ -29,11 +30,13 @@
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
 import org.apache.hadoop.hive.serde.serdeConstants;
+import org.apache.hadoop.hive.serde2.AbstractEncodingAwareSerDe;
 import org.apache.hadoop.hive.serde2.AbstractSerDe;
 import org.apache.hadoop.hive.serde2.ByteStream;
 import org.apache.hadoop.hive.serde2.SerDe;
 import org.apache.hadoop.hive.serde2.SerDeException;
 import org.apache.hadoop.hive.serde2.SerDeStats;
+import org.apache.hadoop.hive.serde2.SerDeUtils;
 import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
@@ -61,7 +64,7 @@
  * Also LazySimpleSerDe outputs typed columns instead of treating all columns as
  * String like MetadataTypedColumnsetSerDe.
  */
-public class LazySimpleSerDe extends AbstractSerDe {
+public class LazySimpleSerDe extends AbstractEncodingAwareSerDe {
 
   public static final Log LOG = LogFactory.getLog(LazySimpleSerDe.class
       .getName());
@@ -187,6 +190,8 @@ public byte getEscapeChar() {
   public void initialize(Configuration job, Properties tbl)
       throws SerDeException {
 
+    super.initialize(job, tbl);
+
     serdeParams = LazySimpleSerDe.initSerdeParams(job, tbl, getClass()
         .getName());
 
@@ -330,7 +335,7 @@ public static SerDeParameters initSerdeParams(Configuration job,
    * @see SerDe#deserialize(Writable)
    */
   @Override
-  public Object deserialize(Writable field) throws SerDeException {
+  public Object doDeserialize(Writable field) throws SerDeException {
     if (byteArrayRef == null) {
       byteArrayRef = new ByteArrayRef();
     }
@@ -375,7 +380,7 @@ public ObjectInspector getObjectInspector() throws SerDeException {
    * @see SerDe#serialize(Object, ObjectInspector)
    */
   @Override
-  public Writable serialize(Object obj, ObjectInspector objInspector)
+  public Writable doSerialize(Object obj, ObjectInspector objInspector)
       throws SerDeException {
 
     if (objInspector.getCategory() != Category.STRUCT) {
@@ -584,4 +589,16 @@ public SerDeStats getSerDeStats() {
     return stats;
 
   }
+
+  @Override
+  protected Writable transformFromUTF8(Writable blob) {
+    Text text = (Text)blob;
+    return SerDeUtils.transformTextFromUTF8(text, this.charset);
+  }
+
+  @Override
+  protected Writable transformToUTF8(Writable blob) {
+    Text text = (Text)blob;
+    return SerDeUtils.transformTextToUTF8(text, this.charset);
+  }
 }