diff --git a/hbase-handler/src/java/org/apache/hadoop/hive/hbase/HBaseSerDe.java b/hbase-handler/src/java/org/apache/hadoop/hive/hbase/HBaseSerDe.java index 4900a41..df8c70e 100644 --- a/hbase-handler/src/java/org/apache/hadoop/hive/hbase/HBaseSerDe.java +++ b/hbase-handler/src/java/org/apache/hadoop/hive/hbase/HBaseSerDe.java @@ -33,6 +33,7 @@ import org.apache.hadoop.hive.serde.serdeConstants; import org.apache.hadoop.hive.serde2.AbstractSerDe; import org.apache.hadoop.hive.serde2.ByteStream; +import org.apache.hadoop.hive.serde2.SerDe; import org.apache.hadoop.hive.serde2.SerDeException; import org.apache.hadoop.hive.serde2.SerDeStats; import org.apache.hadoop.hive.serde2.SerDeUtils; @@ -68,8 +69,8 @@ public static final String HBASE_SCAN_CACHE = "hbase.scan.cache"; public static final String HBASE_SCAN_CACHEBLOCKS = "hbase.scan.cacheblock"; public static final String HBASE_SCAN_BATCH = "hbase.scan.batch"; - - /** Determines whether a regex matching should be done on the columns or not. Defaults to true. + + /** Determines whether a regex matching should be done on the columns or not. Defaults to true. * WARNING: Note that currently this only supports the suffix wildcard .* **/ public static final String HBASE_COLUMNS_REGEX_MATCHING = "hbase.columns.mapping.regex.matching"; @@ -126,7 +127,8 @@ public void initialize(Configuration conf, Properties tbl) serdeParams.getNullSequence(), serdeParams.isLastColumnTakesRest(), serdeParams.isEscaped(), - serdeParams.getEscapeChar()); + serdeParams.getEscapeChar(), + serdeParams.isBase64Encoding()); cachedHBaseRow = new LazyHBaseRow( (LazySimpleStructObjectInspector) cachedObjectInspector); diff --git a/serde/src/gen/thrift/gen-javabean/org/apache/hadoop/hive/serde/serdeConstants.java b/serde/src/gen/thrift/gen-javabean/org/apache/hadoop/hive/serde/serdeConstants.java index 28f8d6a..8ad4224 100644 --- a/serde/src/gen/thrift/gen-javabean/org/apache/hadoop/hive/serde/serdeConstants.java +++ b/serde/src/gen/thrift/gen-javabean/org/apache/hadoop/hive/serde/serdeConstants.java @@ -6,30 +6,8 @@ */ package org.apache.hadoop.hive.serde; -import org.apache.commons.lang.builder.HashCodeBuilder; -import org.apache.thrift.scheme.IScheme; -import org.apache.thrift.scheme.SchemeFactory; -import org.apache.thrift.scheme.StandardScheme; - -import org.apache.thrift.scheme.TupleScheme; -import org.apache.thrift.protocol.TTupleProtocol; -import org.apache.thrift.protocol.TProtocolException; -import org.apache.thrift.EncodingUtils; -import org.apache.thrift.TException; -import java.util.List; -import java.util.ArrayList; -import java.util.Map; -import java.util.HashMap; -import java.util.EnumMap; -import java.util.Set; import java.util.HashSet; -import java.util.EnumSet; -import java.util.Collections; -import java.util.BitSet; -import java.nio.ByteBuffer; -import java.util.Arrays; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import java.util.Set; public class serdeConstants { @@ -49,6 +27,8 @@ public static final String SERIALIZATION_USE_JSON_OBJECTS = "serialization.use.json.object"; + public static final String SERIALIZATION_USE_BASE64_ENCODING = "serialization.use.base64.encoding"; + public static final String FIELD_DELIM = "field.delim"; public static final String COLLECTION_DELIM = "colelction.delim"; diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/columnar/ColumnarSerDe.java b/serde/src/java/org/apache/hadoop/hive/serde2/columnar/ColumnarSerDe.java index 11f5f07..0ac1ec2 100644 --- a/serde/src/java/org/apache/hadoop/hive/serde2/columnar/ColumnarSerDe.java +++ b/serde/src/java/org/apache/hadoop/hive/serde2/columnar/ColumnarSerDe.java @@ -26,11 +26,9 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.hive.serde2.ByteStream; import org.apache.hadoop.hive.serde2.ColumnProjectionUtils; import org.apache.hadoop.hive.serde2.SerDe; import org.apache.hadoop.hive.serde2.SerDeException; -import org.apache.hadoop.hive.serde2.SerDeStats; import org.apache.hadoop.hive.serde2.SerDeUtils; import org.apache.hadoop.hive.serde2.lazy.LazyFactory; import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe; @@ -79,6 +77,7 @@ public ColumnarSerDe() throws SerDeException { * * @see SerDe#initialize(Configuration, Properties) */ + @Override public void initialize(Configuration job, Properties tbl) throws SerDeException { serdeParams = LazySimpleSerDe.initSerdeParams(job, tbl, getClass().getName()); @@ -88,7 +87,7 @@ public void initialize(Configuration job, Properties tbl) throws SerDeException cachedObjectInspector = LazyFactory.createColumnarStructInspector( serdeParams.getColumnNames(), serdeParams.getColumnTypes(), serdeParams .getSeparators(), serdeParams.getNullSequence(), serdeParams - .isEscaped(), serdeParams.getEscapeChar()); + .isEscaped(), serdeParams.getEscapeChar(), serdeParams.isBase64Encoding()); java.util.ArrayList notSkipIDs = ColumnProjectionUtils.getReadColumnIDs(job); @@ -114,6 +113,7 @@ public void initialize(Configuration job, Properties tbl) throws SerDeException * @return The serialized Writable object * @see SerDe#serialize(Object, ObjectInspector) */ + @Override public Writable serialize(Object obj, ObjectInspector objInspector) throws SerDeException { if (objInspector.getCategory() != Category.STRUCT) { diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyBinary.java b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyBinary.java index ae12f20..0a14ab9 100644 --- a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyBinary.java +++ b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyBinary.java @@ -18,14 +18,11 @@ package org.apache.hadoop.hive.serde2.lazy; -import java.nio.charset.CharacterCodingException; - import org.apache.commons.codec.binary.Base64; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyBinaryObjectInspector; import org.apache.hadoop.io.BytesWritable; -import org.apache.hadoop.io.Text; public class LazyBinary extends LazyPrimitive { @@ -49,12 +46,12 @@ public void init(ByteArrayRef bytes, int start, int length) { byte[] recv = new byte[length]; System.arraycopy(bytes.getData(), start, recv, 0, length); - boolean arrayByteBase64 = Base64.isArrayByteBase64(recv); + /*boolean arrayByteBase64 = Base64.isArrayByteBase64(recv); if (arrayByteBase64) { LOG.debug("Data not contains valid characters within the Base64 alphabet so " + "decoded the data."); - } - byte[] decoded = arrayByteBase64 ? Base64.decodeBase64(recv) : recv; + }*/ + byte[] decoded = oi.isBase64Encoding() ? Base64.decodeBase64(recv) : recv; data.set(decoded, 0, decoded.length); } diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyFactory.java b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyFactory.java index a08b4a8..4872493 100644 --- a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyFactory.java +++ b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyFactory.java @@ -27,18 +27,18 @@ import org.apache.hadoop.hive.serde2.lazy.objectinspector.LazyObjectInspectorFactory; import org.apache.hadoop.hive.serde2.lazy.objectinspector.LazySimpleStructObjectInspector; import org.apache.hadoop.hive.serde2.lazy.objectinspector.LazyUnionObjectInspector; -import org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyHiveDecimalObjectInspector; import org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyBinaryObjectInspector; import org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyBooleanObjectInspector; import org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyByteObjectInspector; +import org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyDateObjectInspector; import org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyDoubleObjectInspector; import org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyFloatObjectInspector; +import org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyHiveDecimalObjectInspector; import org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyIntObjectInspector; import org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyLongObjectInspector; import org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyPrimitiveObjectInspectorFactory; import org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyShortObjectInspector; import org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyStringObjectInspector; -import org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyDateObjectInspector; import org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyTimestampObjectInspector; import org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyVoidObjectInspector; import org.apache.hadoop.hive.serde2.lazydio.LazyDioBoolean; @@ -212,27 +212,34 @@ public static ObjectInspector createLazyObjectInspector(TypeInfo typeInfo, byte[] separator, int separatorIndex, Text nullSequence, boolean escaped, byte escapeChar) throws SerDeException { + return createLazyObjectInspector(typeInfo, + separator, separatorIndex, nullSequence, escaped, escapeChar); + } + + public static ObjectInspector createLazyObjectInspector(TypeInfo typeInfo, + byte[] separator, int separatorIndex, Text nullSequence, boolean escaped, + byte escapeChar, boolean isBase64Encoding) throws SerDeException { ObjectInspector.Category c = typeInfo.getCategory(); switch (c) { case PRIMITIVE: return LazyPrimitiveObjectInspectorFactory.getLazyObjectInspector( ((PrimitiveTypeInfo) typeInfo).getPrimitiveCategory(), escaped, - escapeChar); + escapeChar, isBase64Encoding); case MAP: return LazyObjectInspectorFactory.getLazySimpleMapObjectInspector( createLazyObjectInspector(((MapTypeInfo) typeInfo) - .getMapKeyTypeInfo(), separator, separatorIndex + 2, - nullSequence, escaped, escapeChar), createLazyObjectInspector( - ((MapTypeInfo) typeInfo).getMapValueTypeInfo(), separator, - separatorIndex + 2, nullSequence, escaped, escapeChar), + .getMapKeyTypeInfo(), separator, separatorIndex + 2, + nullSequence, escaped, escapeChar, isBase64Encoding), createLazyObjectInspector( + ((MapTypeInfo) typeInfo).getMapValueTypeInfo(), separator, + separatorIndex + 2, nullSequence, escaped, escapeChar, isBase64Encoding), LazyUtils.getSeparator(separator, separatorIndex), - LazyUtils.getSeparator(separator, separatorIndex+1), + LazyUtils.getSeparator(separator, separatorIndex + 1), nullSequence, escaped, escapeChar); case LIST: return LazyObjectInspectorFactory.getLazySimpleListObjectInspector( createLazyObjectInspector(((ListTypeInfo) typeInfo) .getListElementTypeInfo(), separator, separatorIndex + 1, - nullSequence, escaped, escapeChar), LazyUtils.getSeparator(separator, separatorIndex), + nullSequence, escaped, escapeChar, isBase64Encoding), LazyUtils.getSeparator(separator, separatorIndex), nullSequence, escaped, escapeChar); case STRUCT: StructTypeInfo structTypeInfo = (StructTypeInfo) typeInfo; @@ -244,7 +251,7 @@ public static ObjectInspector createLazyObjectInspector(TypeInfo typeInfo, for (int i = 0; i < fieldTypeInfos.size(); i++) { fieldObjectInspectors.add(createLazyObjectInspector(fieldTypeInfos .get(i), separator, separatorIndex + 1, nullSequence, escaped, - escapeChar)); + escapeChar, isBase64Encoding)); } return LazyObjectInspectorFactory.getLazySimpleStructObjectInspector( fieldNames, fieldObjectInspectors, @@ -256,7 +263,7 @@ public static ObjectInspector createLazyObjectInspector(TypeInfo typeInfo, for (TypeInfo uti : unionTypeInfo.getAllUnionObjectTypeInfos()) { lazyOIs.add(createLazyObjectInspector(uti, separator, separatorIndex + 1, nullSequence, escaped, - escapeChar)); + escapeChar, isBase64Encoding)); } return LazyObjectInspectorFactory.getLazyUnionObjectInspector(lazyOIs, LazyUtils.getSeparator(separator, separatorIndex), @@ -277,24 +284,37 @@ public static ObjectInspector createLazyObjectInspector(TypeInfo typeInfo, * @see LazyFactory#createLazyObjectInspector(TypeInfo, byte[], int, Text, * boolean, byte) */ + public static ObjectInspector createLazyStructInspector( List columnNames, List typeInfos, byte[] separators, Text nullSequence, boolean lastColumnTakesRest, boolean escaped, byte escapeChar) throws SerDeException { + return createLazyStructInspector( + columnNames, typeInfos, separators, + nullSequence, lastColumnTakesRest, escaped, + escapeChar, true); + } + + public static ObjectInspector createLazyStructInspector( + List columnNames, List typeInfos, byte[] separators, + Text nullSequence, boolean lastColumnTakesRest, boolean escaped, + byte escapeChar, boolean isBase64Encoding) throws SerDeException { ArrayList columnObjectInspectors = new ArrayList( typeInfos.size()); for (int i = 0; i < typeInfos.size(); i++) { columnObjectInspectors.add(LazyFactory.createLazyObjectInspector( - typeInfos.get(i), separators, 1, nullSequence, escaped, escapeChar)); + typeInfos.get(i), separators, 1, nullSequence, escaped, escapeChar, isBase64Encoding)); } return LazyObjectInspectorFactory.getLazySimpleStructObjectInspector( columnNames, columnObjectInspectors, separators[0], nullSequence, lastColumnTakesRest, escaped, escapeChar); } + /** * Create a hierarchical ObjectInspector for ColumnarStruct with the given * columnNames and columnTypeInfos. + * @param isBase64Encoding * @throws SerDeException * * @see LazyFactory#createLazyObjectInspector(TypeInfo, byte[], int, Text, @@ -302,13 +322,13 @@ public static ObjectInspector createLazyStructInspector( */ public static ObjectInspector createColumnarStructInspector( List columnNames, List columnTypes, byte[] separators, - Text nullSequence, boolean escaped, byte escapeChar) throws SerDeException { + Text nullSequence, boolean escaped, byte escapeChar, boolean isBase64Encoding) throws SerDeException { ArrayList columnObjectInspectors = new ArrayList( columnTypes.size()); for (int i = 0; i < columnTypes.size(); i++) { columnObjectInspectors .add(LazyFactory.createLazyObjectInspector(columnTypes.get(i), - separators, 1, nullSequence, escaped, escapeChar)); + separators, 1, nullSequence, escaped, escapeChar, isBase64Encoding)); } return ObjectInspectorFactory.getColumnarStructObjectInspector(columnNames, columnObjectInspectors); diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazySimpleSerDe.java b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazySimpleSerDe.java index 606208c..6c437f4 100644 --- a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazySimpleSerDe.java +++ b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazySimpleSerDe.java @@ -128,6 +128,7 @@ public static byte getByte(String altValue, byte defaultVal) { boolean escaped; byte escapeChar; boolean[] needsEscape; + boolean base64Encoding; public List getColumnTypes() { return columnTypes; @@ -168,6 +169,15 @@ public byte getEscapeChar() { public boolean[] getNeedsEscape() { return needsEscape; } + + public boolean isBase64Encoding() { + return base64Encoding; + } + + public void setBase64Encoding(boolean base64Encoding) { + this.base64Encoding = base64Encoding; + } + } SerDeParameters serdeParams = null; @@ -192,7 +202,7 @@ public void initialize(Configuration job, Properties tbl) .getColumnNames(), serdeParams.getColumnTypes(), serdeParams .getSeparators(), serdeParams.getNullSequence(), serdeParams .isLastColumnTakesRest(), serdeParams.isEscaped(), serdeParams - .getEscapeChar()); + .getEscapeChar(), serdeParams.isBase64Encoding()); cachedLazyStruct = (LazyStruct) LazyFactory .createLazyObject(cachedObjectInspector); @@ -307,6 +317,12 @@ public static SerDeParameters initSerdeParams(Configuration job, serdeParams.needsEscape[serdeParams.separators[i]] = true; } } + String strBase64 = tbl.getProperty(serdeConstants.SERIALIZATION_USE_BASE64_ENCODING, "FALSE"); + if (strBase64.equalsIgnoreCase("TRUE")) { + serdeParams.base64Encoding = true; + } else { + serdeParams.base64Encoding = false; + } return serdeParams; } diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/primitive/LazyBinaryObjectInspector.java b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/primitive/LazyBinaryObjectInspector.java index dbd60f7..bb13e11 100644 --- a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/primitive/LazyBinaryObjectInspector.java +++ b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/primitive/LazyBinaryObjectInspector.java @@ -27,9 +27,15 @@ public class LazyBinaryObjectInspector extends AbstractPrimitiveLazyObjectInspector implements BinaryObjectInspector { + protected boolean base64Encoding = true; public LazyBinaryObjectInspector() { + this(true); + } + + public LazyBinaryObjectInspector(boolean _base64Encoding) { super(PrimitiveObjectInspectorUtils.binaryTypeEntry); + this.base64Encoding = _base64Encoding; } @Override @@ -49,4 +55,13 @@ public Object copyObject(Object o) { public BytesWritable getPrimitiveWritableObject(Object o) { return null == o ? null : ((LazyBinary) o).getWritableObject(); } + + public boolean isBase64Encoding() { + return base64Encoding; + } + + public void setBase64Encoding(boolean base64Encoding) { + this.base64Encoding = base64Encoding; + } + } diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/primitive/LazyPrimitiveObjectInspectorFactory.java b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/primitive/LazyPrimitiveObjectInspectorFactory.java index afbf454..a8739b4 100644 --- a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/primitive/LazyPrimitiveObjectInspectorFactory.java +++ b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/primitive/LazyPrimitiveObjectInspectorFactory.java @@ -58,7 +58,9 @@ public static final LazyTimestampObjectInspector LAZY_TIMESTAMP_OBJECT_INSPECTOR = new LazyTimestampObjectInspector(); public static final LazyBinaryObjectInspector LAZY_BINARY_OBJECT_INSPECTOR = - new LazyBinaryObjectInspector(); + new LazyBinaryObjectInspector(false); + public static final LazyBinaryObjectInspector LAZY_BINARY_OBJECT_INSPECTOR_BASE64_ENCODING = + new LazyBinaryObjectInspector(true); public static final LazyHiveDecimalObjectInspector LAZY_BIG_DECIMAL_OBJECT_INSPECTOR = new LazyHiveDecimalObjectInspector(); @@ -80,8 +82,7 @@ public static LazyStringObjectInspector getLazyStringObjectInspector( } public static AbstractPrimitiveLazyObjectInspector getLazyObjectInspector( - PrimitiveCategory primitiveCategory, boolean escaped, byte escapeChar) { - + PrimitiveCategory primitiveCategory, boolean escaped, byte escapeChar, boolean isBase64Encoding) { switch (primitiveCategory) { case BOOLEAN: return LAZY_BOOLEAN_OBJECT_INSPECTOR; @@ -100,7 +101,11 @@ public static LazyStringObjectInspector getLazyStringObjectInspector( case STRING: return getLazyStringObjectInspector(escaped, escapeChar); case BINARY: - return LAZY_BINARY_OBJECT_INSPECTOR; + if (isBase64Encoding) { + return LAZY_BINARY_OBJECT_INSPECTOR_BASE64_ENCODING; + } else { + return LAZY_BINARY_OBJECT_INSPECTOR; + } case VOID: return LAZY_VOID_OBJECT_INSPECTOR; case DATE: