Index: hbase-handler/src/test/org/apache/hadoop/hive/hbase/TestLazyHBaseObject.java =================================================================== --- hbase-handler/src/test/org/apache/hadoop/hive/hbase/TestLazyHBaseObject.java (revision 1307724) +++ hbase-handler/src/test/org/apache/hadoop/hive/hbase/TestLazyHBaseObject.java (working copy) @@ -63,7 +63,7 @@ Text nullSequence = new Text("\\N"); ObjectInspector oi = LazyFactory.createLazyObjectInspector( TypeInfoUtils.getTypeInfosFromTypeString("map").get(0), - new byte[]{(byte)1, (byte)2}, 0, nullSequence, false, (byte)0); + new byte[]{(byte)1, (byte)2}, 0, nullSequence, false, (byte)0, null); LazyHBaseCellMap b = new LazyHBaseCellMap((LazyMapObjectInspector) oi); @@ -124,7 +124,7 @@ Text nullSequence = new Text("\\N"); ObjectInspector oi = LazyFactory.createLazyObjectInspector( TypeInfoUtils.getTypeInfosFromTypeString("map").get(0), - new byte[]{(byte)'#', (byte)'\t'}, 0, nullSequence, false, (byte)0); + new byte[]{(byte)'#', (byte)'\t'}, 0, nullSequence, false, (byte)0, null); LazyHBaseCellMap b = new LazyHBaseCellMap((LazyMapObjectInspector) oi); @@ -186,7 +186,7 @@ Text nullSequence = new Text("\\N"); TypeInfo mapBinaryIntKeyValue = TypeInfoUtils.getTypeInfoFromTypeString("map"); ObjectInspector oi = LazyFactory.createLazyObjectInspector( - mapBinaryIntKeyValue, new byte [] {(byte)1, (byte) 2}, 0, nullSequence, false, (byte) 0); + mapBinaryIntKeyValue, new byte [] {(byte)1, (byte) 2}, 0, nullSequence, false, (byte) 0, null); LazyHBaseCellMap hbaseCellMap = new LazyHBaseCellMap((LazyMapObjectInspector) oi); List kvs = new ArrayList(); @@ -229,7 +229,7 @@ TypeInfo mapBinaryByteKeyValue = TypeInfoUtils.getTypeInfoFromTypeString("map"); oi = LazyFactory.createLazyObjectInspector( - mapBinaryByteKeyValue, new byte [] {(byte) 1, (byte) 2}, 0, nullSequence, false, (byte) 0); + mapBinaryByteKeyValue, new byte [] {(byte) 1, (byte) 2}, 0, nullSequence, false, (byte) 0, null); hbaseCellMap = new 
LazyHBaseCellMap((LazyMapObjectInspector) oi); byte [] cfByte = "cf-byte".getBytes(); kvs.clear(); @@ -267,7 +267,7 @@ TypeInfo mapBinaryShortKeyValue = TypeInfoUtils.getTypeInfoFromTypeString("map"); oi = LazyFactory.createLazyObjectInspector( - mapBinaryShortKeyValue, new byte [] {(byte) 1, (byte) 2}, 0, nullSequence, false, (byte) 0); + mapBinaryShortKeyValue, new byte [] {(byte) 1, (byte) 2}, 0, nullSequence, false, (byte) 0, null); hbaseCellMap = new LazyHBaseCellMap((LazyMapObjectInspector) oi); byte [] cfShort = "cf-short".getBytes(); kvs.clear(); @@ -305,7 +305,7 @@ TypeInfo mapBinaryLongKeyValue = TypeInfoUtils.getTypeInfoFromTypeString("map"); oi = LazyFactory.createLazyObjectInspector( - mapBinaryLongKeyValue, new byte [] {(byte) 1, (byte) 2}, 0, nullSequence, false, (byte) 0); + mapBinaryLongKeyValue, new byte [] {(byte) 1, (byte) 2}, 0, nullSequence, false, (byte) 0, null); hbaseCellMap = new LazyHBaseCellMap((LazyMapObjectInspector) oi); byte [] cfLong = "cf-long".getBytes(); kvs.clear(); @@ -344,7 +344,7 @@ TypeInfoUtils.getTypeInfoFromTypeString("map"); oi = LazyFactory.createLazyObjectInspector( mapBinaryFloatKeyValue, new byte [] {(byte) 1, (byte) 2}, 0, nullSequence, false, - (byte) 0); + (byte) 0, null); hbaseCellMap = new LazyHBaseCellMap((LazyMapObjectInspector) oi); byte [] cfFloat = "cf-float".getBytes(); kvs.clear(); @@ -384,7 +384,7 @@ TypeInfoUtils.getTypeInfoFromTypeString("map"); oi = LazyFactory.createLazyObjectInspector( mapBinaryDoubleKeyValue, new byte [] {(byte) 1, (byte) 2}, 0, nullSequence, false, - (byte) 0); + (byte) 0, null); hbaseCellMap = new LazyHBaseCellMap((LazyMapObjectInspector) oi); byte [] cfDouble = "cf-double".getBytes(); kvs.clear(); @@ -423,7 +423,7 @@ TypeInfoUtils.getTypeInfoFromTypeString("map"); oi = LazyFactory.createLazyObjectInspector( mapBinaryBooleanKeyValue, new byte [] {(byte) 1, (byte) 2}, 0, nullSequence, false, - (byte) 0); + (byte) 0, null); hbaseCellMap = new 
LazyHBaseCellMap((LazyMapObjectInspector) oi); byte [] cfBoolean = "cf-boolean".getBytes(); kvs.clear(); @@ -480,7 +480,7 @@ ObjectInspector oi = LazyFactory.createLazyStructInspector(fieldNames, fieldTypeInfos, new byte[] {' ', ':', '='}, - nullSequence, false, false, (byte)0); + nullSequence, false, false, (byte)0, null); LazyHBaseRow o = new LazyHBaseRow((LazySimpleStructObjectInspector) oi); List kvs = new ArrayList(); @@ -600,7 +600,7 @@ fieldNames, fieldTypeInfos, new byte[] {' ', ':', '='}, - nullSequence, false, false, (byte) 0); + nullSequence, false, false, (byte) 0, null); LazyHBaseRow o = new LazyHBaseRow((LazySimpleStructObjectInspector) oi); List kvs = new ArrayList(); @@ -721,7 +721,7 @@ ObjectInspector oi = LazyFactory.createLazyStructInspector(fieldNames, fieldTypeInfos, - new byte [] {' ', ':', '='}, nullSequence, false, false, (byte) 0); + new byte [] {' ', ':', '='}, nullSequence, false, false, (byte) 0, null); LazyHBaseRow o = new LazyHBaseRow((LazySimpleStructObjectInspector) oi); Index: data/files/gbk.txt =================================================================== --- data/files/gbk.txt (revision 0) +++ data/files/gbk.txt (revision 0) @@ -0,0 +1,3 @@ +×Ö·ûÒ» ×Ö·û¶ø +²âÊÔÒ» ²âÊÔ¶þ +English abc Index: serde/src/test/org/apache/hadoop/hive/serde2/lazy/TestLazyArrayMapStruct.java =================================================================== --- serde/src/test/org/apache/hadoop/hive/serde2/lazy/TestLazyArrayMapStruct.java (revision 1307724) +++ serde/src/test/org/apache/hadoop/hive/serde2/lazy/TestLazyArrayMapStruct.java (working copy) @@ -46,7 +46,7 @@ Text nullSequence = new Text("\\N"); ObjectInspector oi = LazyFactory.createLazyObjectInspector(TypeInfoUtils .getTypeInfosFromTypeString("array").get(0), - new byte[] {(byte) 1}, 0, nullSequence, false, (byte) 0); + new byte[] {(byte) 1}, 0, nullSequence, false, (byte) 0, null); LazyArray b = (LazyArray) LazyFactory.createLazyObject(oi); byte[] data = new byte[] {'-', '1', 1, '\\', 
'N', 1, '8'}; TestLazyPrimitive.initLazyObject(b, data, 0, data.length); @@ -68,7 +68,7 @@ // Array of String oi = LazyFactory.createLazyObjectInspector(TypeInfoUtils .getTypeInfosFromTypeString("array").get(0), - new byte[] {(byte) '\t'}, 0, nullSequence, false, (byte) 0); + new byte[] {(byte) '\t'}, 0, nullSequence, false, (byte) 0, null); b = (LazyArray) LazyFactory.createLazyObject(oi); data = new byte[] {'a', 'b', '\t', 'c', '\t', '\\', 'N', '\t', '\t', 'd'}; // Note: the first and last element of the byte[] are NOT used @@ -113,7 +113,7 @@ .createLazyObjectInspector(TypeInfoUtils .getTypeInfosFromTypeString("map").get(0), new byte[] {(byte) 1, (byte) 2}, 0, nullSequence, false, - (byte) 0); + (byte) 0, null); LazyMap b = (LazyMap) LazyFactory.createLazyObject(oi); byte[] data = new byte[] {'2', 2, 'd', 'e', 'f', 1, '-', '1', 2, '\\', 'N', 1, '0', 2, '0', 1, '8', 2, 'a', 'b', 'c'}; @@ -138,7 +138,7 @@ ObjectInspector oi = LazyFactory.createLazyObjectInspector( TypeInfoUtils.getTypeInfosFromTypeString("map").get( 0), new byte[] {(byte) '#', (byte) '\t'}, 0, nullSequence, - false, (byte) 0); + false, (byte) 0, null); LazyMap b = (LazyMap) LazyFactory.createLazyObject(oi); byte[] data = new byte[] {'2', '\t', 'd', '\t', 'f', '#', '2', '\t', 'd', '#', '-', '1', '#', '0', '\t', '0', '#', '8', '\t', 'a', 'b', 'c'}; @@ -177,7 +177,7 @@ ObjectInspector oi = LazyFactory.createLazyStructInspector(fieldNames, fieldTypeInfos, new byte[] {' ', ':', '='}, nullSequence, false, - false, (byte) 0); + false, (byte) 0, null); LazyStruct o = (LazyStruct) LazyFactory.createLazyObject(oi); Text data; @@ -229,7 +229,7 @@ // test LastColumnTakesRest oi = LazyFactory.createLazyStructInspector(Arrays.asList(new String[] { "a", "b", "c", "d"}), fieldTypeInfos, - new byte[] {' ', ':', '='}, nullSequence, true, false, (byte) 0); + new byte[] {' ', ':', '='}, nullSequence, true, false, (byte) 0, null); o = (LazyStruct) LazyFactory.createLazyObject(oi); data = new Text("\\N a d=\\N:f=g:h 
has tail"); TestLazyPrimitive.initLazyObject(o, data.getBytes(), 0, data @@ -255,7 +255,7 @@ Text nullSequence = new Text("\\N"); ObjectInspector oi = LazyFactory.createLazyObjectInspector(typeInfo, - new byte[] {'^', ':', '='}, 0, nullSequence, false, (byte) 0); + new byte[] {'^', ':', '='}, 0, nullSequence, false, (byte) 0, null); LazyUnion o = (LazyUnion) LazyFactory.createLazyObject(oi); Text data; Index: serde/src/test/org/apache/hadoop/hive/serde2/lazy/TestLazyPrimitive.java =================================================================== --- serde/src/test/org/apache/hadoop/hive/serde2/lazy/TestLazyPrimitive.java (revision 1307724) +++ serde/src/test/org/apache/hadoop/hive/serde2/lazy/TestLazyPrimitive.java (working copy) @@ -360,7 +360,7 @@ public void testLazyString() throws Throwable { try { LazyString b = new LazyString(LazyPrimitiveObjectInspectorFactory - .getLazyStringObjectInspector(false, (byte) 0)); + .getLazyStringObjectInspector(false, (byte) 0, null)); initLazyObject(b, new byte[] {'0'}, 0, 0); assertEquals(new Text(""), b.getWritableObject()); initLazyObject(b, new byte[] {'0'}, 0, 1); Index: serde/src/test/org/apache/hadoop/hive/serde2/lazy/TestLazyCharset.java =================================================================== --- serde/src/test/org/apache/hadoop/hive/serde2/lazy/TestLazyCharset.java (revision 0) +++ serde/src/test/org/apache/hadoop/hive/serde2/lazy/TestLazyCharset.java (revision 0) @@ -0,0 +1,112 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.serde2.lazy; + +import java.util.List; +import java.util.Properties; +import java.util.Formatter; + +import junit.framework.TestCase; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.serde.Constants; +import org.apache.hadoop.hive.serde2.SerDeUtils; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StructField; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.io.Text; + +public class TestLazyCharset extends TestCase { + // Expected data in UTF-8 charset + byte [] zifu = {(byte)0xe5, (byte)0xad, (byte)0x97, + (byte)0xe7, (byte)0xac, (byte)0xa6}; + byte [] yi = {(byte)0xe4, (byte)0xb8, (byte)0x80}; + byte [] er = {(byte)0xe4, (byte)0xba, (byte)0x8c}; + byte [] jian = {(byte)0xe9, (byte)0x94, (byte)0xae}; + byte [] zhi = {(byte)0xe5, (byte)0x80, (byte)0xbc}; + + public void testLazyCharsetGB18030() throws Throwable { + try { + String charset = "gb18030"; + LazySimpleSerDe serDe = createSerDe(charset); + + // Input data in GB18030 charset which contains three fields using default seperators + byte [] bytes = {(byte)0xd7, (byte)0xd6, (byte)0xb7, (byte)0xfb, (byte)0x1, // string + (byte)0xd2, (byte)0xbb, (byte)0x2, (byte)0xb6, (byte)0xfe, (byte)0x1, // list + (byte)0xbc, (byte)0xfc, (byte)0x3, (byte)0xd6, (byte)0xb5}; // map + + Text t = new Text(bytes); + + serializeAndDeserialize(serDe, t, bytes); + } catch (Throwable e) { + e.printStackTrace(); + throw e; + } + } 
+ + private LazySimpleSerDe createSerDe(String charset) throws Throwable{ + // Create the SerDe + LazySimpleSerDe serDe = new LazySimpleSerDe(); + Configuration conf = new Configuration(); + Properties tbl = new Properties(); + + // Set properties + tbl.setProperty("columns", + "astring,astringlist,astringmap"); + tbl.setProperty("columns.types", + "string:array:map"); + tbl.setProperty(Constants.SERIALIZATION_NULL_FORMAT, "NULL"); + tbl.setProperty(Constants.CHARSET, charset); + + serDe.initialize(conf, tbl); + + return serDe; + } + + private void serializeAndDeserialize(LazySimpleSerDe serDe, Text t, byte [] expected) throws Throwable{ + // Test + StructObjectInspector soi = (StructObjectInspector) serDe.getObjectInspector(); + List fieldRefs = soi.getAllStructFieldRefs(); + assertEquals(3, fieldRefs.size()); + + // Deserialize + Object row = serDe.deserialize(t); + + // Test String + LazyPrimitive fieldString = (LazyPrimitive) soi.getStructFieldData(row, fieldRefs.get(0)); + assertEquals(new Text(zifu), (Text)fieldString.getWritableObject()); + + // Test String Array + LazyArray fieldArray = (LazyArray) soi.getStructFieldData(row, fieldRefs.get(1)); + assertEquals(new Text(yi), (Text)((LazyString)fieldArray.getListElementObject(0)).getWritableObject()); + assertEquals(new Text(er), (Text)((LazyString)fieldArray.getListElementObject(1)).getWritableObject()); + + // Test String Map + LazyMap fieldMap = (LazyMap) soi.getStructFieldData(row, fieldRefs.get(2)); + assertEquals(new Text(zhi), (Text)((LazyString)fieldMap.getMapValueElement(new Text(jian))).getWritableObject()); + + // Serialize + Text serializedText = (Text) serDe.serialize(row, soi); + byte [] serializedBytes = serializedText.getBytes(); + + assertEquals(expected.length, serializedText.getLength()); + for (int i=0; i encoders = new HashMap(); + private static Map decoders = new HashMap(); + + public static ByteBuffer convertToUTF8(ByteBuffer bytes, String charset) { + + if (!equalsCharset(charset, 
"utf-8")) { + CharsetDecoder decoder = getDecoder(charset); + CharsetEncoder encoder = getEncoder("utf-8"); + + bytes = convertCharset(bytes, decoder, encoder); + } + + return bytes; + } + + public static ByteBuffer convertFromUTF8(ByteBuffer bytes, String charset) { + if (!equalsCharset(charset, "utf-8")) { + CharsetEncoder encoder = getEncoder(charset); + CharsetDecoder decoder = getDecoder("utf-8"); + + bytes = convertCharset(bytes, decoder, encoder); + } + + return bytes; + } + + public static ByteBuffer convertCharset(ByteBuffer bb, CharsetDecoder decoder, CharsetEncoder encoder){ + + if (cachedCharBuffer == null + || (bb.remaining() * decoder.maxCharsPerByte() > cachedCharBuffer.capacity())){ + // allocate for cachedCharBuffer + cachedCharBuffer = CharBuffer.allocate((int) (bb.remaining() * decoder.maxCharsPerByte() * 2)); + } + cachedCharBuffer.clear(); + + CoderResult coderResult = decoder.decode(bb, cachedCharBuffer, true); + assert coderResult != CoderResult.OVERFLOW; + cachedCharBuffer.flip(); + + if (cachedByteBuffer == null + || (cachedCharBuffer.remaining() * encoder.maxBytesPerChar() > cachedByteBuffer.capacity())){ + // allocate for cachedByteBuffer + cachedByteBuffer = ByteBuffer.allocate((int) (cachedCharBuffer.remaining() * encoder.maxBytesPerChar() * 2)); + } else{ + cachedByteBuffer.clear(); + } + + coderResult = encoder.encode(cachedCharBuffer, cachedByteBuffer, true); + assert coderResult != CoderResult.OVERFLOW; + cachedByteBuffer.flip(); + + return cachedByteBuffer; + } + + + public static CharsetEncoder getEncoder(String charset) { + Charset cs = Charset.forName(charset); + + if (encoders.containsKey(cs)) { + return encoders.get(cs); + } else { + CharsetEncoder encoder = cs.newEncoder(); + encoders.put(cs, encoder); + return encoder; + } + } + + public static CharsetDecoder getDecoder(String charset) { + Charset cs = Charset.forName(charset); + + if (decoders.containsKey(cs)) { + return decoders.get(cs); + } else { + CharsetDecoder decoder 
= cs.newDecoder(); + decoders.put(cs, decoder); + return decoder; + } + } + + public static boolean equalsCharset(String charset1, String charset2){ + return Charset.forName(charset1).equals(Charset.forName(charset2)); + } + private LazyUtils() { // prevent instantiation } Index: serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyString.java =================================================================== --- serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyString.java (revision 1307724) +++ serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyString.java (working copy) @@ -17,6 +17,7 @@ */ package org.apache.hadoop.hive.serde2.lazy; +import java.nio.ByteBuffer; import org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyStringObjectInspector; import org.apache.hadoop.io.Text; @@ -37,9 +38,19 @@ @Override public void init(ByteArrayRef bytes, int start, int length) { + + ByteBuffer byteBuffer = ByteBuffer.wrap(bytes.getData(), start, length); + + if (oi.getCharset() != null) { + byteBuffer = LazyUtils + .convertToUTF8(byteBuffer, oi.getCharset()); + start = byteBuffer.position(); + length = byteBuffer.remaining(); + } + if (oi.isEscaped()) { byte escapeChar = oi.getEscapeChar(); - byte[] inputBytes = bytes.getData(); + byte[] inputBytes = byteBuffer.array(); // First calculate the length of the output string int outputLength = 0; @@ -54,14 +65,14 @@ // Copy the data over, so that the internal state of Text will be set to // the required outputLength. - data.set(bytes.getData(), start, outputLength); + data.set(byteBuffer.array(), start, outputLength); // We need to copy the data byte by byte only in case the // "outputLength < length" (which means there is at least one escaped // byte. 
if (outputLength < length) { int k = 0; - byte[] outputBytes = data.getBytes(); + byte[] outputBytes = data.getBytes(); for (int i = 0; i < length; i++) { byte b = inputBytes[start + i]; if (b != escapeChar || i == length - 1) { @@ -76,7 +87,7 @@ } } else { // if the data is not escaped, simply copy the data. - data.set(bytes.getData(), start, length); + data.set(byteBuffer.array(), start, length); } } Index: serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyFactory.java =================================================================== --- serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyFactory.java (revision 1307724) +++ serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyFactory.java (working copy) @@ -200,28 +200,28 @@ */ public static ObjectInspector createLazyObjectInspector(TypeInfo typeInfo, byte[] separator, int separatorIndex, Text nullSequence, boolean escaped, - byte escapeChar) { + byte escapeChar, String charset) { ObjectInspector.Category c = typeInfo.getCategory(); switch (c) { case PRIMITIVE: return LazyPrimitiveObjectInspectorFactory.getLazyObjectInspector( ((PrimitiveTypeInfo) typeInfo).getPrimitiveCategory(), escaped, - escapeChar); + escapeChar, charset); case MAP: return LazyObjectInspectorFactory.getLazySimpleMapObjectInspector( createLazyObjectInspector(((MapTypeInfo) typeInfo) .getMapKeyTypeInfo(), separator, separatorIndex + 2, - nullSequence, escaped, escapeChar), createLazyObjectInspector( + nullSequence, escaped, escapeChar, charset), createLazyObjectInspector( ((MapTypeInfo) typeInfo).getMapValueTypeInfo(), separator, - separatorIndex + 2, nullSequence, escaped, escapeChar), + separatorIndex + 2, nullSequence, escaped, escapeChar, charset), separator[separatorIndex], separator[separatorIndex + 1], - nullSequence, escaped, escapeChar); + nullSequence, escaped, escapeChar, charset); case LIST: return LazyObjectInspectorFactory.getLazySimpleListObjectInspector( createLazyObjectInspector(((ListTypeInfo) typeInfo)
.getListElementTypeInfo(), separator, separatorIndex + 1, - nullSequence, escaped, escapeChar), separator[separatorIndex], - nullSequence, escaped, escapeChar); + nullSequence, escaped, escapeChar, charset), separator[separatorIndex], + nullSequence, escaped, escapeChar, charset); case STRUCT: StructTypeInfo structTypeInfo = (StructTypeInfo) typeInfo; List fieldNames = structTypeInfo.getAllStructFieldNames(); @@ -232,21 +232,21 @@ for (int i = 0; i < fieldTypeInfos.size(); i++) { fieldObjectInspectors.add(createLazyObjectInspector(fieldTypeInfos .get(i), separator, separatorIndex + 1, nullSequence, escaped, - escapeChar)); + escapeChar, charset)); } return LazyObjectInspectorFactory.getLazySimpleStructObjectInspector( fieldNames, fieldObjectInspectors, separator[separatorIndex], - nullSequence, false, escaped, escapeChar); + nullSequence, false, escaped, escapeChar, charset); case UNION: UnionTypeInfo unionTypeInfo = (UnionTypeInfo) typeInfo; List lazyOIs = new ArrayList(); for (TypeInfo uti : unionTypeInfo.getAllUnionObjectTypeInfos()) { lazyOIs.add(createLazyObjectInspector(uti, separator, separatorIndex + 1, nullSequence, escaped, - escapeChar)); + escapeChar, charset)); } return LazyObjectInspectorFactory.getLazyUnionObjectInspector(lazyOIs, - separator[separatorIndex], nullSequence, escaped, escapeChar); + separator[separatorIndex], nullSequence, escaped, escapeChar, charset); } throw new RuntimeException("Hive LazySerDe Internal error."); @@ -265,16 +265,16 @@ public static ObjectInspector createLazyStructInspector( List columnNames, List typeInfos, byte[] separators, Text nullSequence, boolean lastColumnTakesRest, boolean escaped, - byte escapeChar) { + byte escapeChar, String charset) { ArrayList columnObjectInspectors = new ArrayList( typeInfos.size()); for (int i = 0; i < typeInfos.size(); i++) { columnObjectInspectors.add(LazyFactory.createLazyObjectInspector( - typeInfos.get(i), separators, 1, nullSequence, escaped, escapeChar)); + typeInfos.get(i), 
separators, 1, nullSequence, escaped, escapeChar, charset)); } return LazyObjectInspectorFactory.getLazySimpleStructObjectInspector( columnNames, columnObjectInspectors, separators[0], nullSequence, - lastColumnTakesRest, escaped, escapeChar); + lastColumnTakesRest, escaped, escapeChar, charset); } /** @@ -286,13 +286,13 @@ */ public static ObjectInspector createColumnarStructInspector( List columnNames, List columnTypes, byte[] separators, - Text nullSequence, boolean escaped, byte escapeChar) { + Text nullSequence, boolean escaped, byte escapeChar, String charset) { ArrayList columnObjectInspectors = new ArrayList( columnTypes.size()); for (int i = 0; i < columnTypes.size(); i++) { columnObjectInspectors .add(LazyFactory.createLazyObjectInspector(columnTypes.get(i), - separators, 1, nullSequence, escaped, escapeChar)); + separators, 1, nullSequence, escaped, escapeChar, charset)); } return ObjectInspectorFactory.getColumnarStructObjectInspector(columnNames, columnObjectInspectors); Index: serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazySimpleSerDe.java =================================================================== --- serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazySimpleSerDe.java (revision 1307724) +++ serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazySimpleSerDe.java (working copy) @@ -19,6 +19,7 @@ package org.apache.hadoop.hive.serde2.lazy; import java.io.IOException; +import java.nio.ByteBuffer; import java.util.Arrays; import java.util.List; import java.util.Map; @@ -65,6 +66,7 @@ .getName()); public static final byte[] DefaultSeparators = {(byte) 1, (byte) 2, (byte) 3}; + public static final String DefaultCharset = "utf-8"; private ObjectInspector cachedObjectInspector; @@ -125,6 +127,9 @@ byte escapeChar; boolean[] needsEscape; + String charset; + boolean needTransform; + public List getColumnTypes() { return columnTypes; } @@ -164,6 +169,10 @@ public boolean[] getNeedsEscape() { return needsEscape; } + + public String getCharset() { 
+ return charset; + } } SerDeParameters serdeParams = null; @@ -187,7 +196,7 @@ .getColumnNames(), serdeParams.getColumnTypes(), serdeParams .getSeparators(), serdeParams.getNullSequence(), serdeParams .isLastColumnTakesRest(), serdeParams.isEscaped(), serdeParams - .getEscapeChar()); + .getEscapeChar(), serdeParams.getCharset()); cachedLazyStruct = (LazyStruct) LazyFactory .createLazyObject(cachedObjectInspector); @@ -254,6 +263,10 @@ } } + // Get charset + String charset = tbl.getProperty(Constants.CHARSET, DefaultCharset); + serdeParams.charset = charset; + return serdeParams; } @@ -365,11 +378,18 @@ serializeField(serializeStream, f, foi, serdeParams); } + ByteBuffer byteBuffer = ByteBuffer.wrap(serializeStream.getData(), 0, serializeStream.getCount()); + if (serdeParams.getCharset() != null + && !LazyUtils.equalsCharset(serdeParams.getCharset(), DefaultCharset)) { + byteBuffer = LazyUtils.convertFromUTF8(byteBuffer, serdeParams.getCharset()); + } + // TODO: The copy of data is unnecessary, but there is no work-around // since we cannot directly set the private byte[] field inside Text. 
serializeCache - .set(serializeStream.getData(), 0, serializeStream.getCount()); - serializedSize = serializeStream.getCount(); + .set(byteBuffer.array(), byteBuffer.position(), byteBuffer.remaining()); + + serializedSize = byteBuffer.remaining(); lastOperationSerialize = true; lastOperationDeserialize = false; return serializeCache; Index: serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/LazyObjectInspectorFactory.java =================================================================== --- serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/LazyObjectInspectorFactory.java (revision 1307724) +++ serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/LazyObjectInspectorFactory.java (working copy) @@ -45,17 +45,17 @@ List structFieldNames, List structFieldObjectInspectors, byte separator, Text nullSequence, boolean lastColumnTakesRest, boolean escaped, - byte escapeChar) { + byte escapeChar, String charset) { return getLazySimpleStructObjectInspector(structFieldNames, structFieldObjectInspectors, null, separator, nullSequence, - lastColumnTakesRest, escaped, escapeChar); + lastColumnTakesRest, escaped, escapeChar, charset); } public static LazySimpleStructObjectInspector getLazySimpleStructObjectInspector( List structFieldNames, List structFieldObjectInspectors, List structFieldComments, byte separator, Text nullSequence, boolean lastColumnTakesRest, - boolean escaped,byte escapeChar) { + boolean escaped,byte escapeChar, String charset) { ArrayList signature = new ArrayList(); signature.add(structFieldNames); signature.add(structFieldObjectInspectors); @@ -64,6 +64,7 @@ signature.add(Boolean.valueOf(lastColumnTakesRest)); signature.add(Boolean.valueOf(escaped)); signature.add(Byte.valueOf(escapeChar)); + signature.add(charset); if(structFieldComments != null) { signature.add(structFieldComments); } @@ -72,7 +73,7 @@ if (result == null) { result = new LazySimpleStructObjectInspector(structFieldNames, 
structFieldObjectInspectors, structFieldComments, separator, - nullSequence, lastColumnTakesRest, escaped, escapeChar); + nullSequence, lastColumnTakesRest, escaped, escapeChar, charset); cachedLazySimpleStructObjectInspector.put(signature, result); } return result; @@ -82,18 +83,19 @@ public static LazyListObjectInspector getLazySimpleListObjectInspector( ObjectInspector listElementObjectInspector, byte separator, - Text nullSequence, boolean escaped, byte escapeChar) { + Text nullSequence, boolean escaped, byte escapeChar, String charset) { ArrayList signature = new ArrayList(); signature.add(listElementObjectInspector); signature.add(Byte.valueOf(separator)); signature.add(nullSequence.toString()); signature.add(Boolean.valueOf(escaped)); signature.add(Byte.valueOf(escapeChar)); + signature.add(charset); LazyListObjectInspector result = cachedLazySimpleListObjectInspector .get(signature); if (result == null) { result = new LazyListObjectInspector(listElementObjectInspector, - separator, nullSequence, escaped, escapeChar); + separator, nullSequence, escaped, escapeChar, charset); cachedLazySimpleListObjectInspector.put(signature, result); } return result; @@ -105,7 +107,7 @@ ObjectInspector mapKeyObjectInspector, ObjectInspector mapValueObjectInspector, byte itemSeparator, byte keyValueSeparator, Text nullSequence, boolean escaped, - byte escapeChar) { + byte escapeChar, String charset) { ArrayList signature = new ArrayList(); signature.add(mapKeyObjectInspector); signature.add(mapValueObjectInspector); @@ -114,12 +116,13 @@ signature.add(nullSequence.toString()); signature.add(Boolean.valueOf(escaped)); signature.add(Byte.valueOf(escapeChar)); + signature.add(charset); LazyMapObjectInspector result = cachedLazySimpleMapObjectInspector .get(signature); if (result == null) { result = new LazyMapObjectInspector(mapKeyObjectInspector, mapValueObjectInspector, itemSeparator, keyValueSeparator, - nullSequence, escaped, escapeChar); + nullSequence, escaped, escapeChar, 
charset); cachedLazySimpleMapObjectInspector.put(signature, result); } return result; @@ -131,18 +134,19 @@ public static LazyUnionObjectInspector getLazyUnionObjectInspector( List ois, byte separator, Text nullSequence, - boolean escaped, byte escapeChar) { + boolean escaped, byte escapeChar, String charset) { List signature = new ArrayList(); signature.add(ois); signature.add(Byte.valueOf(separator)); signature.add(nullSequence.toString()); signature.add(Boolean.valueOf(escaped)); signature.add(Byte.valueOf(escapeChar)); + signature.add(charset); LazyUnionObjectInspector result = cachedLazyUnionObjectInspector .get(signature); if (result == null) { result = new LazyUnionObjectInspector(ois, separator, - nullSequence, escaped, escapeChar); + nullSequence, escaped, escapeChar, charset); cachedLazyUnionObjectInspector.put(signature, result); } return result; Index: serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/LazyUnionObjectInspector.java =================================================================== --- serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/LazyUnionObjectInspector.java (revision 1307724) +++ serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/LazyUnionObjectInspector.java (working copy) @@ -52,23 +52,25 @@ Text nullSequence; boolean escaped; byte escapeChar; + String charset; protected LazyUnionObjectInspector( List ois, byte separator, Text nullSequence, boolean escaped, - byte escapeChar) { + byte escapeChar, String charset) { init(ois, separator, - nullSequence, escaped, escapeChar); + nullSequence, escaped, escapeChar, charset); } protected void init( List ois, byte separator, Text nullSequence, boolean escaped, - byte escapeChar) { + byte escapeChar, String charset) { this.separator = separator; this.nullSequence = nullSequence; this.escaped = escaped; this.escapeChar = escapeChar; + this.charset = charset; this.ois = new ArrayList(); this.ois.addAll(ois); } @@ -107,6 +109,10 @@ return 
escapeChar; } + public String getCharset() { + return charset; + } + @Override public Object getField(Object data) { if (data == null) { Index: serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/primitive/LazyStringObjectInspector.java =================================================================== --- serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/primitive/LazyStringObjectInspector.java (revision 1307724) +++ serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/primitive/LazyStringObjectInspector.java (working copy) @@ -30,11 +30,13 @@ boolean escaped; byte escapeChar; + String charset; - LazyStringObjectInspector(boolean escaped, byte escapeChar) { + LazyStringObjectInspector(boolean escaped, byte escapeChar, String charset) { super(PrimitiveObjectInspectorUtils.stringTypeEntry); this.escaped = escaped; this.escapeChar = escapeChar; + this.charset = charset; } @Override @@ -60,4 +62,7 @@ return escapeChar; } + public String getCharset() { + return charset; + } } Index: serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/primitive/LazyPrimitiveObjectInspectorFactory.java =================================================================== --- serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/primitive/LazyPrimitiveObjectInspectorFactory.java (revision 1307724) +++ serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/primitive/LazyPrimitiveObjectInspectorFactory.java (working copy) @@ -62,21 +62,22 @@ new HashMap, LazyStringObjectInspector>(); public static LazyStringObjectInspector getLazyStringObjectInspector( - boolean escaped, byte escapeChar) { + boolean escaped, byte escapeChar, String charset) { ArrayList signature = new ArrayList(); signature.add(Boolean.valueOf(escaped)); signature.add(Byte.valueOf(escapeChar)); + signature.add(charset); LazyStringObjectInspector result = cachedLazyStringObjectInspector .get(signature); if (result == null) { - result = new 
LazyStringObjectInspector(escaped, escapeChar); + result = new LazyStringObjectInspector(escaped, escapeChar, charset); cachedLazyStringObjectInspector.put(signature, result); } return result; } public static AbstractPrimitiveLazyObjectInspector getLazyObjectInspector( - PrimitiveCategory primitiveCategory, boolean escaped, byte escapeChar) { + PrimitiveCategory primitiveCategory, boolean escaped, byte escapeChar, String charset) { switch (primitiveCategory) { case BOOLEAN: @@ -94,7 +95,7 @@ case DOUBLE: return LAZY_DOUBLE_OBJECT_INSPECTOR; case STRING: - return getLazyStringObjectInspector(escaped, escapeChar); + return getLazyStringObjectInspector(escaped, escapeChar, charset); case BINARY: return LAZY_BINARY_OBJECT_INSPECTOR; case VOID: Index: serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/LazyListObjectInspector.java =================================================================== --- serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/LazyListObjectInspector.java (revision 1307724) +++ serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/LazyListObjectInspector.java (working copy) @@ -44,17 +44,19 @@ Text nullSequence; boolean escaped; byte escapeChar; + String charset; /** * Call ObjectInspectorFactory.getLazySimpleListObjectInspector instead. 
*/ protected LazyListObjectInspector(ObjectInspector listElementObjectInspector, - byte separator, Text nullSequence, boolean escaped, byte escapeChar) { + byte separator, Text nullSequence, boolean escaped, byte escapeChar, String charset) { this.listElementObjectInspector = listElementObjectInspector; this.separator = separator; this.nullSequence = nullSequence; this.escaped = escaped; this.escapeChar = escapeChar; + this.charset = charset; } @Override Index: serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/LazyMapObjectInspector.java =================================================================== --- serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/LazyMapObjectInspector.java (revision 1307724) +++ serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/LazyMapObjectInspector.java (working copy) @@ -46,6 +46,7 @@ Text nullSequence; boolean escaped; byte escapeChar; + String charset; /** * Call ObjectInspectorFactory.getStandardListObjectInspector instead. 
@@ -53,7 +54,7 @@ protected LazyMapObjectInspector(ObjectInspector mapKeyObjectInspector, ObjectInspector mapValueObjectInspector, byte itemSeparator, byte keyValueSeparator, Text nullSequence, boolean escaped, - byte escapeChar) { + byte escapeChar, String charset) { this.mapKeyObjectInspector = mapKeyObjectInspector; this.mapValueObjectInspector = mapValueObjectInspector; @@ -62,6 +63,7 @@ this.nullSequence = nullSequence; this.escaped = escaped; this.escapeChar = escapeChar; + this.charset = charset; } @Override Index: serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/LazySimpleStructObjectInspector.java =================================================================== --- serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/LazySimpleStructObjectInspector.java (revision 1307724) +++ serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/LazySimpleStructObjectInspector.java (working copy) @@ -97,6 +97,7 @@ boolean lastColumnTakesRest; boolean escaped; byte escapeChar; + String charset; /** * Call ObjectInspectorFactory.getLazySimpleStructObjectInspector instead. 
@@ -104,24 +105,25 @@ protected LazySimpleStructObjectInspector(List structFieldNames, List structFieldObjectInspectors, byte separator, Text nullSequence, boolean lastColumnTakesRest, boolean escaped, - byte escapeChar) { + byte escapeChar, String charset) { init(structFieldNames, structFieldObjectInspectors, null, separator, - nullSequence, lastColumnTakesRest, escaped, escapeChar); + nullSequence, lastColumnTakesRest, escaped, escapeChar, charset); } public LazySimpleStructObjectInspector(List structFieldNames, List structFieldObjectInspectors, List structFieldComments, byte separator, Text nullSequence, - boolean lastColumnTakesRest, boolean escaped, byte escapeChar) { + boolean lastColumnTakesRest, boolean escaped, byte escapeChar, + String charset) { init(structFieldNames, structFieldObjectInspectors, structFieldComments, - separator, nullSequence, lastColumnTakesRest, escaped, escapeChar); + separator, nullSequence, lastColumnTakesRest, escaped, escapeChar, charset); } protected void init(List structFieldNames, List structFieldObjectInspectors, List structFieldComments, byte separator, Text nullSequence, boolean lastColumnTakesRest, boolean escaped, - byte escapeChar) { + byte escapeChar, String charset) { assert (structFieldNames.size() == structFieldObjectInspectors.size()); assert (structFieldComments == null || structFieldNames.size() == structFieldComments.size()); @@ -131,6 +133,7 @@ this.lastColumnTakesRest = lastColumnTakesRest; this.escaped = escaped; this.escapeChar = escapeChar; + this.charset = charset; fields = new ArrayList(structFieldNames.size()); for (int i = 0; i < structFieldNames.size(); i++) { @@ -218,4 +221,8 @@ return escapeChar; } + public String getCharset() { + return charset; + } + } Index: serde/src/java/org/apache/hadoop/hive/serde2/columnar/ColumnarSerDe.java =================================================================== --- serde/src/java/org/apache/hadoop/hive/serde2/columnar/ColumnarSerDe.java (revision 1307724) +++ 
serde/src/java/org/apache/hadoop/hive/serde2/columnar/ColumnarSerDe.java (working copy) @@ -88,7 +88,7 @@ cachedObjectInspector = LazyFactory.createColumnarStructInspector( serdeParams.getColumnNames(), serdeParams.getColumnTypes(), serdeParams .getSeparators(), serdeParams.getNullSequence(), serdeParams - .isEscaped(), serdeParams.getEscapeChar()); + .isEscaped(), serdeParams.getEscapeChar(), serdeParams.getCharset()); java.util.ArrayList notSkipIDs = ColumnProjectionUtils.getReadColumnIDs(job); Index: serde/src/gen/thrift/gen-py/org_apache_hadoop_hive_serde/constants.py =================================================================== --- serde/src/gen/thrift/gen-py/org_apache_hadoop_hive_serde/constants.py (revision 1307724) +++ serde/src/gen/thrift/gen-py/org_apache_hadoop_hive_serde/constants.py (working copy) @@ -21,6 +21,7 @@ MAPKEY_DELIM = "mapkey.delim" QUOTE_CHAR = "quote.delim" ESCAPE_CHAR = "escape.delim" +CHARSET = "charset" VOID_TYPE_NAME = "void" BOOLEAN_TYPE_NAME = "boolean" TINYINT_TYPE_NAME = "tinyint" Index: serde/src/gen/thrift/gen-cpp/serde_constants.cpp =================================================================== --- serde/src/gen/thrift/gen-cpp/serde_constants.cpp (revision 1307724) +++ serde/src/gen/thrift/gen-cpp/serde_constants.cpp (working copy) @@ -38,6 +38,8 @@ ESCAPE_CHAR = "escape.delim"; + CHARSET = "charset"; + VOID_TYPE_NAME = "void"; BOOLEAN_TYPE_NAME = "boolean"; Index: serde/src/gen/thrift/gen-cpp/serde_constants.h =================================================================== --- serde/src/gen/thrift/gen-cpp/serde_constants.h (revision 1307724) +++ serde/src/gen/thrift/gen-cpp/serde_constants.h (working copy) @@ -28,6 +28,7 @@ std::string MAPKEY_DELIM; std::string QUOTE_CHAR; std::string ESCAPE_CHAR; + std::string CHARSET; std::string VOID_TYPE_NAME; std::string BOOLEAN_TYPE_NAME; std::string TINYINT_TYPE_NAME; Index: serde/src/gen/thrift/gen-rb/serde_constants.rb 
=================================================================== --- serde/src/gen/thrift/gen-rb/serde_constants.rb (revision 1307724) +++ serde/src/gen/thrift/gen-rb/serde_constants.rb (working copy) @@ -34,6 +34,8 @@ ESCAPE_CHAR = %q"escape.delim" +CHARSET = %q"charset" + VOID_TYPE_NAME = %q"void" BOOLEAN_TYPE_NAME = %q"boolean" Index: serde/src/gen/thrift/gen-javabean/org/apache/hadoop/hive/serde/Constants.java =================================================================== --- serde/src/gen/thrift/gen-javabean/org/apache/hadoop/hive/serde/Constants.java (revision 1307724) +++ serde/src/gen/thrift/gen-javabean/org/apache/hadoop/hive/serde/Constants.java (working copy) @@ -50,6 +50,8 @@ public static final String ESCAPE_CHAR = "escape.delim"; + public static final String CHARSET = "charset"; + public static final String VOID_TYPE_NAME = "void"; public static final String BOOLEAN_TYPE_NAME = "boolean"; Index: serde/src/gen/thrift/gen-php/serde/serde_constants.php =================================================================== --- serde/src/gen/thrift/gen-php/serde/serde_constants.php (revision 1307724) +++ serde/src/gen/thrift/gen-php/serde/serde_constants.php (working copy) @@ -36,6 +36,8 @@ $GLOBALS['serde_CONSTANTS']['ESCAPE_CHAR'] = "escape.delim"; +$GLOBALS['serde_CONSTANTS']['CHARSET'] = "charset"; + $GLOBALS['serde_CONSTANTS']['VOID_TYPE_NAME'] = "void"; $GLOBALS['serde_CONSTANTS']['BOOLEAN_TYPE_NAME'] = "boolean"; Index: serde/if/serde.thrift =================================================================== --- serde/if/serde.thrift (revision 1307724) +++ serde/if/serde.thrift (working copy) @@ -20,6 +20,7 @@ const string MAPKEY_DELIM = "mapkey.delim" const string QUOTE_CHAR = "quote.delim" const string ESCAPE_CHAR = "escape.delim" +const string CHARSET = "charset" typedef string PrimitiveType typedef string CollectionType Index: ql/src/test/results/clientpositive/charset.q.out 
=================================================================== --- ql/src/test/results/clientpositive/charset.q.out (revision 0) +++ ql/src/test/results/clientpositive/charset.q.out (revision 0) @@ -0,0 +1,62 @@ +PREHOOK: query: EXPLAIN +CREATE TABLE test_charset(str1 STRING, str2 STRING) + ROW FORMAT CHARSET 'gbk' DELIMITED FIELDS TERMINATED BY '\t' +PREHOOK: type: CREATETABLE +POSTHOOK: query: EXPLAIN +CREATE TABLE test_charset(str1 STRING, str2 STRING) + ROW FORMAT CHARSET 'gbk' DELIMITED FIELDS TERMINATED BY '\t' +POSTHOOK: type: CREATETABLE +ABSTRACT SYNTAX TREE: + (TOK_CREATETABLE (TOK_TABNAME test_charset) TOK_LIKETABLE (TOK_TABCOLLIST (TOK_TABCOL str1 TOK_STRING) (TOK_TABCOL str2 TOK_STRING)) (TOK_TABLEROWFORMAT (TOK_CHARSET 'gbk') (TOK_SERDEPROPS (TOK_TABLEROWFORMATFIELD '\t')))) + +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Create Table Operator: + Create Table + charset: gbk + columns: str1 string, str2 string + field delimiter: + if not exists: false + input format: org.apache.hadoop.mapred.TextInputFormat + # buckets: -1 + output format: org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat + primaryRegionName: + name: test_charset + isExternal: false + + +PREHOOK: query: CREATE TABLE test_charset(str1 STRING, str2 STRING) + ROW FORMAT CHARSET 'gbk' DELIMITED FIELDS TERMINATED BY '\t' +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE test_charset(str1 STRING, str2 STRING) + ROW FORMAT CHARSET 'gbk' DELIMITED FIELDS TERMINATED BY '\t' +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@test_charset +PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/gbk.txt' OVERWRITE INTO TABLE test_charset +PREHOOK: type: LOAD +PREHOOK: Output: default@test_charset +POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/gbk.txt' OVERWRITE INTO TABLE test_charset +POSTHOOK: type: LOAD +POSTHOOK: Output: default@test_charset +PREHOOK: query: SELECT str1, str2 FROM test_charset LIMIT 10 +PREHOOK: type: QUERY 
+PREHOOK: Input: default@test_charset +#### A masked pattern was here #### +POSTHOOK: query: SELECT str1, str2 FROM test_charset LIMIT 10 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_charset +#### A masked pattern was here #### +字符一 字符而 +测试一 测试二 +English abc +PREHOOK: query: DROP TABLE test_charset +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@test_charset +PREHOOK: Output: default@test_charset +POSTHOOK: query: DROP TABLE test_charset +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@test_charset +POSTHOOK: Output: default@test_charset Index: ql/src/test/results/clientpositive/input35.q.out =================================================================== --- ql/src/test/results/clientpositive/input35.q.out (revision 1307724) +++ ql/src/test/results/clientpositive/input35.q.out (working copy) @@ -22,7 +22,7 @@ INSERT OVERWRITE TABLE dest1 SELECT tkey, tvalue POSTHOOK: type: QUERY ABSTRACT SYNTAX TREE: - (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME src))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TRANSFORM (TOK_EXPLIST (. (TOK_TABLE_OR_COL src) key) (. (TOK_TABLE_OR_COL src) value)) (TOK_SERDE (TOK_SERDEPROPS (TOK_TABLEROWFORMATFIELD '\002'))) TOK_RECORDWRITER '/bin/cat' (TOK_SERDE (TOK_SERDEPROPS (TOK_TABLEROWFORMATFIELD '\002'))) TOK_RECORDREADER (TOK_ALIASLIST tkey tvalue)))))) tmap)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME dest1))) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL tkey)) (TOK_SELEXPR (TOK_TABLE_OR_COL tvalue))))) + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME src))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TRANSFORM (TOK_EXPLIST (. (TOK_TABLE_OR_COL src) key) (. 
(TOK_TABLE_OR_COL src) value)) (TOK_SERDE (TOK_ROWFORMAT (TOK_SERDEPROPS (TOK_TABLEROWFORMATFIELD '\002')))) TOK_RECORDWRITER '/bin/cat' (TOK_SERDE (TOK_ROWFORMAT (TOK_SERDEPROPS (TOK_TABLEROWFORMATFIELD '\002')))) TOK_RECORDREADER (TOK_ALIASLIST tkey tvalue)))))) tmap)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME dest1))) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL tkey)) (TOK_SELEXPR (TOK_TABLE_OR_COL tvalue))))) STAGE DEPENDENCIES: Stage-1 is a root stage Index: ql/src/test/results/clientpositive/input36.q.out =================================================================== --- ql/src/test/results/clientpositive/input36.q.out (revision 1308023) +++ ql/src/test/results/clientpositive/input36.q.out (working copy) @@ -22,7 +22,7 @@ INSERT OVERWRITE TABLE dest1 SELECT tkey, tvalue POSTHOOK: type: QUERY ABSTRACT SYNTAX TREE: - (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME src))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TRANSFORM (TOK_EXPLIST (. (TOK_TABLE_OR_COL src) key) (. (TOK_TABLE_OR_COL src) value)) (TOK_SERDE (TOK_SERDEPROPS (TOK_TABLEROWFORMATFIELD '\002'))) TOK_RECORDWRITER '/bin/cat' (TOK_SERDE (TOK_SERDEPROPS (TOK_TABLEROWFORMATFIELD '\003'))) TOK_RECORDREADER (TOK_ALIASLIST tkey tvalue)))))) tmap)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME dest1))) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL tkey)) (TOK_SELEXPR (TOK_TABLE_OR_COL tvalue))))) + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME src))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TRANSFORM (TOK_EXPLIST (. (TOK_TABLE_OR_COL src) key) (. 
(TOK_TABLE_OR_COL src) value)) (TOK_SERDE (TOK_ROWFORMAT (TOK_SERDEPROPS (TOK_TABLEROWFORMATFIELD '\002')))) TOK_RECORDWRITER '/bin/cat' (TOK_SERDE (TOK_ROWFORMAT (TOK_SERDEPROPS (TOK_TABLEROWFORMATFIELD '\003')))) TOK_RECORDREADER (TOK_ALIASLIST tkey tvalue)))))) tmap)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME dest1))) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL tkey)) (TOK_SELEXPR (TOK_TABLE_OR_COL tvalue))))) STAGE DEPENDENCIES: Stage-1 is a root stage Index: ql/src/test/results/clientpositive/transform_charset.q.out =================================================================== --- ql/src/test/results/clientpositive/transform_charset.q.out (revision 0) +++ ql/src/test/results/clientpositive/transform_charset.q.out (revision 0) @@ -0,0 +1,80 @@ +PREHOOK: query: CREATE TABLE test_gbk(str1 STRING, str2 STRING) + ROW FORMAT CHARSET 'gbk' DELIMITED FIELDS TERMINATED BY '\t' +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE test_gbk(str1 STRING, str2 STRING) + ROW FORMAT CHARSET 'gbk' DELIMITED FIELDS TERMINATED BY '\t' +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@test_gbk +PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/gbk.txt' OVERWRITE INTO TABLE test_gbk +PREHOOK: type: LOAD +PREHOOK: Output: default@test_gbk +POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/gbk.txt' OVERWRITE INTO TABLE test_gbk +POSTHOOK: type: LOAD +POSTHOOK: Output: default@test_gbk +PREHOOK: query: EXPLAIN +FROM test_gbk +SELECT TRANSFORM(test_gbk.str1, test_gbk.str2) ROW FORMAT CHARSET 'gbk' +USING '/bin/cat' +AS (str_utf8_1, str_utf8_2) ROW FORMAT CHARSET 'gbk' +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +FROM test_gbk +SELECT TRANSFORM(test_gbk.str1, test_gbk.str2) ROW FORMAT CHARSET 'gbk' +USING '/bin/cat' +AS (str_utf8_1, str_utf8_2) ROW FORMAT CHARSET 'gbk' +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME test_gbk))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT 
(TOK_SELEXPR (TOK_TRANSFORM (TOK_EXPLIST (. (TOK_TABLE_OR_COL test_gbk) str1) (. (TOK_TABLE_OR_COL test_gbk) str2)) (TOK_SERDE TOK_ROWFORMAT (TOK_CHARSET 'gbk')) TOK_RECORDWRITER '/bin/cat' (TOK_SERDE TOK_ROWFORMAT (TOK_CHARSET 'gbk')) TOK_RECORDREADER (TOK_ALIASLIST str_utf8_1 str_utf8_2)))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + test_gbk + TableScan + alias: test_gbk + Select Operator + expressions: + expr: str1 + type: string + expr: str2 + type: string + outputColumnNames: _col0, _col1 + Transform Operator + command: /bin/cat + output info: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: FROM test_gbk +SELECT TRANSFORM(test_gbk.str1, test_gbk.str2) ROW FORMAT CHARSET 'gbk' +USING '/bin/cat' +AS (str_utf8_1, str_utf8_2) ROW FORMAT CHARSET 'gbk' +PREHOOK: type: QUERY +PREHOOK: Input: default@test_gbk +#### A masked pattern was here #### +POSTHOOK: query: FROM test_gbk +SELECT TRANSFORM(test_gbk.str1, test_gbk.str2) ROW FORMAT CHARSET 'gbk' +USING '/bin/cat' +AS (str_utf8_1, str_utf8_2) ROW FORMAT CHARSET 'gbk' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_gbk +#### A masked pattern was here #### +字符一 字符而 +测试一 测试二 +English abc Index: ql/src/test/queries/clientpositive/transform_charset.q =================================================================== --- ql/src/test/queries/clientpositive/transform_charset.q (revision 0) +++ ql/src/test/queries/clientpositive/transform_charset.q (revision 0) @@ -0,0 +1,15 @@ +CREATE TABLE test_gbk(str1 STRING, str2 STRING) + ROW FORMAT CHARSET 'gbk' 
DELIMITED FIELDS TERMINATED BY '\t'; + +LOAD DATA LOCAL INPATH '../data/files/gbk.txt' OVERWRITE INTO TABLE test_gbk; + +EXPLAIN +FROM test_gbk +SELECT TRANSFORM(test_gbk.str1, test_gbk.str2) ROW FORMAT CHARSET 'gbk' +USING '/bin/cat' +AS (str_utf8_1, str_utf8_2) ROW FORMAT CHARSET 'gbk'; + +FROM test_gbk +SELECT TRANSFORM(test_gbk.str1, test_gbk.str2) ROW FORMAT CHARSET 'gbk' +USING '/bin/cat' +AS (str_utf8_1, str_utf8_2) ROW FORMAT CHARSET 'gbk'; Index: ql/src/test/queries/clientpositive/charset.q =================================================================== --- ql/src/test/queries/clientpositive/charset.q (revision 0) +++ ql/src/test/queries/clientpositive/charset.q (revision 0) @@ -0,0 +1,12 @@ +EXPLAIN +CREATE TABLE test_charset(str1 STRING, str2 STRING) + ROW FORMAT CHARSET 'gbk' DELIMITED FIELDS TERMINATED BY '\t'; + +CREATE TABLE test_charset(str1 STRING, str2 STRING) + ROW FORMAT CHARSET 'gbk' DELIMITED FIELDS TERMINATED BY '\t'; + +LOAD DATA LOCAL INPATH '../data/files/gbk.txt' OVERWRITE INTO TABLE test_charset; + +SELECT str1, str2 FROM test_charset LIMIT 10; + +DROP TABLE test_charset; \ No newline at end of file Index: ql/src/java/org/apache/hadoop/hive/ql/plan/CreateTableDesc.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/plan/CreateTableDesc.java (revision 1307724) +++ ql/src/java/org/apache/hadoop/hive/ql/plan/CreateTableDesc.java (working copy) @@ -42,6 +42,7 @@ ArrayList bucketCols; ArrayList sortCols; int numBuckets; + String charset; String fieldDelim; String fieldEscape; String collItemDelim; @@ -64,6 +65,7 @@ public CreateTableDesc(String databaseName, String tableName, boolean isExternal, List cols, List partCols, List bucketCols, List sortCols, int numBuckets, + String charset, String fieldDelim, String fieldEscape, String collItemDelim, String mapKeyDelim, String lineDelim, String comment, String inputFormat, String outputFormat, String location, String serName, @@ 
-74,7 +76,7 @@ String primaryRegionName) { this(tableName, isExternal, cols, partCols, - bucketCols, sortCols, numBuckets, fieldDelim, fieldEscape, + bucketCols, sortCols, numBuckets, charset, fieldDelim, fieldEscape, collItemDelim, mapKeyDelim, lineDelim, comment, inputFormat, outputFormat, location, serName, storageHandler, serdeProps, tblProps, ifNotExists, primaryRegionName); @@ -85,6 +87,7 @@ public CreateTableDesc(String tableName, boolean isExternal, List cols, List partCols, List bucketCols, List sortCols, int numBuckets, + String charset, String fieldDelim, String fieldEscape, String collItemDelim, String mapKeyDelim, String lineDelim, String comment, String inputFormat, String outputFormat, String location, String serName, @@ -100,6 +103,7 @@ this.collItemDelim = collItemDelim; this.cols = new ArrayList(cols); this.comment = comment; + this.charset = charset; this.fieldDelim = fieldDelim; this.fieldEscape = fieldEscape; this.inputFormat = inputFormat; @@ -183,6 +187,15 @@ this.numBuckets = numBuckets; } + @Explain(displayName = "charset") + public String getCharset() { + return charset; + } + + public void setCharset(String charset) { + this.charset = charset; + } + @Explain(displayName = "field delimiter") public String getFieldDelim() { return fieldDelim; @@ -361,4 +374,5 @@ public void setPrimaryRegionName(String primaryRegionName) { this.primaryRegionName = primaryRegionName; } + } Index: ql/src/java/org/apache/hadoop/hive/ql/parse/Hive.g =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/parse/Hive.g (revision 1307724) +++ ql/src/java/org/apache/hadoop/hive/ql/parse/Hive.g (working copy) @@ -71,6 +71,8 @@ TOK_SERDE; TOK_SERDENAME; TOK_SERDEPROPS; +TOK_ROWFORMAT; +TOK_CHARSET; TOK_EXPLIST; TOK_ALIASLIST; TOK_GROUPBY; @@ -1078,7 +1080,8 @@ @init { msgs.push("serde specification"); } @after { msgs.pop(); } : rowFormatSerde -> ^(TOK_SERDE rowFormatSerde) - | rowFormatDelimited -> ^(TOK_SERDE 
rowFormatDelimited) + | KW_ROW KW_FORMAT rowFormatCharset? rowFormatDelimited? + -> ^(TOK_SERDE ^(TOK_ROWFORMAT rowFormatCharset? rowFormatDelimited?) ) | -> ^(TOK_SERDE) ; @@ -1107,16 +1110,24 @@ @init { msgs.push("serde properties specification"); } @after { msgs.pop(); } : - KW_ROW KW_FORMAT KW_DELIMITED tableRowFormatFieldIdentifier? tableRowFormatCollItemsIdentifier? tableRowFormatMapKeysIdentifier? tableRowFormatLinesIdentifier? + KW_DELIMITED tableRowFormatFieldIdentifier? tableRowFormatCollItemsIdentifier? tableRowFormatMapKeysIdentifier? tableRowFormatLinesIdentifier? -> ^(TOK_SERDEPROPS tableRowFormatFieldIdentifier? tableRowFormatCollItemsIdentifier? tableRowFormatMapKeysIdentifier? tableRowFormatLinesIdentifier?) ; +rowFormatCharset +@init { msgs.push("table charset"); } +@after { msgs.pop(); } + : + KW_CHARSET charset=StringLiteral + -> ^(TOK_CHARSET $charset) + ; + tableRowFormat @init { msgs.push("table row format specification"); } @after { msgs.pop(); } : - rowFormatDelimited - -> ^(TOK_TABLEROWFORMAT rowFormatDelimited) + KW_ROW KW_FORMAT rowFormatCharset? rowFormatDelimited? + -> ^(TOK_TABLEROWFORMAT rowFormatCharset? rowFormatDelimited?) 
| rowFormatSerde -> ^(TOK_TABLESERIALIZER rowFormatSerde) ; @@ -2228,6 +2239,7 @@ KW_BUCKETS: 'BUCKETS'; KW_ROW: 'ROW'; KW_FORMAT: 'FORMAT'; +KW_CHARSET: 'CHARSET'; KW_DELIMITED: 'DELIMITED'; KW_FIELDS: 'FIELDS'; KW_TERMINATED: 'TERMINATED'; Index: ql/src/java/org/apache/hadoop/hive/ql/parse/ImportSemanticAnalyzer.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/parse/ImportSemanticAnalyzer.java (revision 1307724) +++ ql/src/java/org/apache/hadoop/hive/ql/parse/ImportSemanticAnalyzer.java (working copy) @@ -102,6 +102,7 @@ table.getSd().getBucketCols(), table.getSd().getSortCols(), table.getSd().getNumBuckets(), + null, // charset passed as serde params null, null, null, null, null, // these 5 delims passed as serde params null, // comment passed as table params table.getSd().getInputFormat(), Index: ql/src/java/org/apache/hadoop/hive/ql/parse/BaseSemanticAnalyzer.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/parse/BaseSemanticAnalyzer.java (revision 1307724) +++ ql/src/java/org/apache/hadoop/hive/ql/parse/BaseSemanticAnalyzer.java (working copy) @@ -111,9 +111,28 @@ String collItemDelim = null; String mapKeyDelim = null; String lineDelim = null; + String charset = null; - protected void analyzeRowFormat(AnalyzeCreateCommonVars shared, ASTNode child) throws SemanticException { - child = (ASTNode) child.getChild(0); + protected void analyzeRowFormat(AnalyzeCreateCommonVars shared, ASTNode ast) throws SemanticException { + + int numCh = ast.getChildCount(); + for (int num = 0; num < numCh; num++){ + ASTNode child = (ASTNode) ast.getChild(num); + + if (child.getToken().getType() == HiveParser.TOK_SERDEPROPS){ + analyzeRowFormatDelimited(shared, child); + } + else if(child.getToken().getType() == HiveParser.TOK_CHARSET){ + charset = unescapeSQLString(child.getChild(0).getText()); + } + else{ + assert false; + } + } + } + + protected 
void analyzeRowFormatDelimited(AnalyzeCreateCommonVars shared, ASTNode child) throws SemanticException { + int numChildRowFormat = child.getChildCount(); for (int numC = 0; numC < numChildRowFormat; numC++) { ASTNode rowChild = (ASTNode) child.getChild(numC); Index: ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java (revision 1308021) +++ ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java (working copy) @@ -1724,48 +1724,57 @@ } } return tblDesc; - } else if (child.getType() == HiveParser.TOK_SERDEPROPS) { + } else if (child.getType() == HiveParser.TOK_ROWFORMAT){ TableDesc tblDesc = PlanUtils.getDefaultTableDesc(Integer .toString(Utilities.ctrlaCode), cols, colTypes, defaultCols); int numChildRowFormat = child.getChildCount(); for (int numC = 0; numC < numChildRowFormat; numC++) { ASTNode rowChild = (ASTNode) child.getChild(numC); - switch (rowChild.getToken().getType()) { - case HiveParser.TOK_TABLEROWFORMATFIELD: - String fieldDelim = unescapeSQLString(rowChild.getChild(0).getText()); + if (rowChild.getToken().getType() == HiveParser.TOK_CHARSET) { + String charset = unescapeSQLString(rowChild.getChild(0).getText()); tblDesc.getProperties() - .setProperty(Constants.FIELD_DELIM, fieldDelim); - tblDesc.getProperties().setProperty(Constants.SERIALIZATION_FORMAT, - fieldDelim); + .setProperty(Constants.CHARSET, charset); + } else if (rowChild.getToken().getType() == HiveParser.TOK_SERDEPROPS) { + int numDelimited = rowChild.getChildCount(); + for (int numD = 0; numD < numDelimited; numD++) { + ASTNode delimitedChild = (ASTNode) rowChild.getChild(numD); + switch (delimitedChild.getToken().getType()) { + case HiveParser.TOK_TABLEROWFORMATFIELD: + String fieldDelim = unescapeSQLString(delimitedChild.getChild(0).getText()); + tblDesc.getProperties() + .setProperty(Constants.FIELD_DELIM, fieldDelim); + 
tblDesc.getProperties().setProperty(Constants.SERIALIZATION_FORMAT, + fieldDelim); - if (rowChild.getChildCount() >= 2) { - String fieldEscape = unescapeSQLString(rowChild.getChild(1) - .getText()); - tblDesc.getProperties().setProperty(Constants.ESCAPE_CHAR, - fieldEscape); + if (delimitedChild.getChildCount() >= 2) { + String fieldEscape = unescapeSQLString(delimitedChild.getChild(1) + .getText()); + tblDesc.getProperties().setProperty(Constants.ESCAPE_CHAR, + fieldEscape); + } + break; + case HiveParser.TOK_TABLEROWFORMATCOLLITEMS: + tblDesc.getProperties().setProperty(Constants.COLLECTION_DELIM, + unescapeSQLString(delimitedChild.getChild(0).getText())); + break; + case HiveParser.TOK_TABLEROWFORMATMAPKEYS: + tblDesc.getProperties().setProperty(Constants.MAPKEY_DELIM, + unescapeSQLString(delimitedChild.getChild(0).getText())); + break; + case HiveParser.TOK_TABLEROWFORMATLINES: + String lineDelim = unescapeSQLString(delimitedChild.getChild(0).getText()); + tblDesc.getProperties().setProperty(Constants.LINE_DELIM, lineDelim); + if (!lineDelim.equals("\n") && !lineDelim.equals("10")) { + throw new SemanticException(generateErrorMessage(delimitedChild, + ErrorMsg.LINES_TERMINATED_BY_NON_NEWLINE.getMsg())); + } + break; + default: + assert false; + } } - break; - case HiveParser.TOK_TABLEROWFORMATCOLLITEMS: - tblDesc.getProperties().setProperty(Constants.COLLECTION_DELIM, - unescapeSQLString(rowChild.getChild(0).getText())); - break; - case HiveParser.TOK_TABLEROWFORMATMAPKEYS: - tblDesc.getProperties().setProperty(Constants.MAPKEY_DELIM, - unescapeSQLString(rowChild.getChild(0).getText())); - break; - case HiveParser.TOK_TABLEROWFORMATLINES: - String lineDelim = unescapeSQLString(rowChild.getChild(0).getText()); - tblDesc.getProperties().setProperty(Constants.LINE_DELIM, lineDelim); - if (!lineDelim.equals("\n") && !lineDelim.equals("10")) { - throw new SemanticException(generateErrorMessage(rowChild, - ErrorMsg.LINES_TERMINATED_BY_NON_NEWLINE.getMsg())); - } - 
break; - default: - assert false; } } - return tblDesc; } @@ -8117,7 +8126,8 @@ tblProps = addDefaultProperties(tblProps); crtTblDesc = new CreateTableDesc(tableName, isExt, cols, partCols, - bucketCols, sortCols, numBuckets, rowFormatParams.fieldDelim, rowFormatParams.fieldEscape, + bucketCols, sortCols, numBuckets, + rowFormatParams.charset, rowFormatParams.fieldDelim, rowFormatParams.fieldEscape, rowFormatParams.collItemDelim, rowFormatParams.mapKeyDelim, rowFormatParams.lineDelim, comment, storageFormat.inputFormat, storageFormat.outputFormat, location, shared.serde, storageFormat.storageHandler, shared.serdeProps, @@ -8161,7 +8171,8 @@ tblProps = addDefaultProperties(tblProps); crtTblDesc = new CreateTableDesc(databaseName, tableName, isExt, cols, partCols, - bucketCols, sortCols, numBuckets, rowFormatParams.fieldDelim, rowFormatParams.fieldEscape, + bucketCols, sortCols, numBuckets, + rowFormatParams.charset, rowFormatParams.fieldDelim, rowFormatParams.fieldEscape, rowFormatParams.collItemDelim, rowFormatParams.mapKeyDelim, rowFormatParams.lineDelim, comment, storageFormat.inputFormat, storageFormat.outputFormat, location, shared.serde, storageFormat.storageHandler, shared.serdeProps, tblProps, ifNotExists, Index:
ql/src/java/org/apache/hadoop/hive/ql/exec/DDLTask.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/DDLTask.java (revision 1307724) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/DDLTask.java (working copy) @@ -3395,6 +3395,9 @@ tbl.setSerializationLib(crtTbl.getSerName()); } + if (crtTbl.getCharset() != null) { + tbl.setSerdeParam(Constants.CHARSET, crtTbl.getCharset()); + } if (crtTbl.getFieldDelim() != null) { tbl.setSerdeParam(Constants.FIELD_DELIM, crtTbl.getFieldDelim()); tbl.setSerdeParam(Constants.SERIALIZATION_FORMAT, crtTbl.getFieldDelim());