diff --git common/src/java/org/apache/hadoop/hive/conf/HiveConf.java common/src/java/org/apache/hadoop/hive/conf/HiveConf.java index 69f0265..dc2f3ad 100644 --- common/src/java/org/apache/hadoop/hive/conf/HiveConf.java +++ common/src/java/org/apache/hadoop/hive/conf/HiveConf.java @@ -211,6 +211,12 @@ public class HiveConf extends Configuration { // not specified DROPIGNORESNONEXISTENT("hive.exec.drop.ignorenonexistent", true), + // By default enum types are treated as struct as they are classes + // with a single int field named value. However, users often refer to Enum + // values by their string name. Enabling this option converts enum values + // to strings at runtime. + CONVERT_ENUM_TO_STRING("hive.data.convert.enum.to.string", false), + // Hadoop Configuration Properties // Properties with null values are ignored and exist only for the purpose of giving us // a symbolic name to reference in the Hive source code. Properties with non-null diff --git ql/src/test/queries/clientpositive/convert_enum_to_string.q ql/src/test/queries/clientpositive/convert_enum_to_string.q new file mode 100644 index 0000000..f085cb2 --- /dev/null +++ ql/src/test/queries/clientpositive/convert_enum_to_string.q @@ -0,0 +1,17 @@ +-- Ensure the default behavior displays Enum fields as struct +-- and that enum-to-string conversion can be enabled/disabled. + +create table convert_enum_to_string + partitioned by (b string) + row format serde "org.apache.hadoop.hive.serde2.thrift.ThriftDeserializer" + with serdeproperties ( + "serialization.class"="org.apache.hadoop.hive.serde2.thrift.test.Complex", + "serialization.format"="org.apache.thrift.protocol.TBinaryProtocol"); + +describe convert_enum_to_string; + +set hive.data.convert.enum.to.string=true; +describe convert_enum_to_string; + +set hive.data.convert.enum.to.string=false; +describe convert_enum_to_string; diff --git ql/src/test/results/clientpositive/convert_enum_to_string.q.out ql/src/test/results/clientpositive/convert_enum_to_string.q.out new file mode 100644 index 0000000..e970ac8 --- /dev/null +++ ql/src/test/results/clientpositive/convert_enum_to_string.q.out @@ -0,0 +1,57 @@ +PREHOOK: query: -- Ensure the default behavior displays Enum fields as struct +-- and that enum-to-string conversion can be enabled/disabled. + +create table convert_enum_to_string + partitioned by (b string) + row format serde "org.apache.hadoop.hive.serde2.thrift.ThriftDeserializer" + with serdeproperties ( + "serialization.class"="org.apache.hadoop.hive.serde2.thrift.test.Complex", + "serialization.format"="org.apache.thrift.protocol.TBinaryProtocol") +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- Ensure the default behavior displays Enum fields as struct +-- and that enum-to-string conversion can be enabled/disabled. + +create table convert_enum_to_string + partitioned by (b string) + row format serde "org.apache.hadoop.hive.serde2.thrift.ThriftDeserializer" + with serdeproperties ( + "serialization.class"="org.apache.hadoop.hive.serde2.thrift.test.Complex", + "serialization.format"="org.apache.thrift.protocol.TBinaryProtocol") +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@convert_enum_to_string +PREHOOK: query: describe convert_enum_to_string +PREHOOK: type: DESCTABLE +POSTHOOK: query: describe convert_enum_to_string +POSTHOOK: type: DESCTABLE +aint int from deserializer +astring string from deserializer +lint array from deserializer +lstring array from deserializer +lintstring array> from deserializer +mstringstring map from deserializer +myenum struct from deserializer +b string +PREHOOK: query: describe convert_enum_to_string +PREHOOK: type: DESCTABLE +POSTHOOK: query: describe convert_enum_to_string +POSTHOOK: type: DESCTABLE +aint int from deserializer +astring string from deserializer +lint array from deserializer +lstring array from deserializer +lintstring array> from deserializer +mstringstring map from deserializer +myenum string from deserializer +b string +PREHOOK: query: describe convert_enum_to_string +PREHOOK: type: DESCTABLE +POSTHOOK: query: describe convert_enum_to_string +POSTHOOK: type: DESCTABLE +aint int from deserializer +astring string from deserializer +lint array from deserializer +lstring array from deserializer +lintstring array> from deserializer +mstringstring map from deserializer +myenum struct from deserializer +b string diff --git serde/if/test/complex.thrift serde/if/test/complex.thrift index 308b64c..d4d0317 100644 --- serde/if/test/complex.thrift +++ serde/if/test/complex.thrift @@ -18,6 +18,11 @@ namespace java org.apache.hadoop.hive.serde2.thrift.test +enum MyEnum { + ALPACA = 1, + LLAMA = 2 +} + struct IntString { 1: i32 myint; 2: string myString; @@ -31,4 +36,5 @@ struct Complex { 4: list lString; 5: list lintString; 6: map mStringString; + 7: MyEnum myEnum; } diff --git serde/ivy.xml serde/ivy.xml index ab4ac30..060e71c 100644 --- serde/ivy.xml +++ serde/ivy.xml @@ -39,6 +39,7 @@ + lString; // required private List lintString; // required private Map mStringString; // required + private MyEnum myEnum; // required /** The set of fields this struct contains, along with convenience methods for finding and manipulating them. */ public enum _Fields implements org.apache.thrift.TFieldIdEnum { @@ -44,7 +46,12 @@ public class Complex implements org.apache.thrift.TBase byName = new HashMap(); @@ -71,6 +78,8 @@ public class Complex implements org.apache.thrift.TBase lint, List lString, List lintString, - Map mStringString) + Map mStringString, + MyEnum myEnum) { this(); this.aint = aint; @@ -157,6 +169,7 @@ public class Complex implements org.apache.thrift.TBase objectInspectorCache = new HashMap(); + private static HashMap objectInspectorCache = + new HashMap(); public static ObjectInspector getReflectionObjectInspector(Type t, ObjectInspectorOptions options) { - ObjectInspector oi = objectInspectorCache.get(t); + return getReflectionObjectInspector(t, options, null); + } + + public static ObjectInspector getReflectionObjectInspector(Type t, + ObjectInspectorOptions options, Properties properties) { + ObjectInspector oi = objectInspectorCache.get(new MultiKey(t, properties)); if (oi == null) { - oi = getReflectionObjectInspectorNoCache(t, options); - objectInspectorCache.put(t, oi); + oi = getReflectionObjectInspectorNoCache(t, options, properties); + objectInspectorCache.put(new MultiKey(t, properties), oi); } verifyObjectInspector(options, oi, ObjectInspectorOptions.JAVA, new Class[]{ThriftStructObjectInspector.class, ProtocolBuffersStructObjectInspector.class}); @@ -101,7 +110,7 @@ public final class ObjectInspectorFactory { } private static ObjectInspector getReflectionObjectInspectorNoCache(Type t, - ObjectInspectorOptions options) { + ObjectInspectorOptions options, Properties properties) { if (t instanceof GenericArrayType) { GenericArrayType at = (GenericArrayType) t; return getStandardListObjectInspector(getReflectionObjectInspector(at @@ -155,6 +164,16 @@ public final class ObjectInspectorFactory { .getTypeEntryFromPrimitiveWritableClass(c).primitiveCategory); } + // Enum class? + if (properties != null && + Boolean.parseBoolean(properties.getProperty( + HiveConf.ConfVars.CONVERT_ENUM_TO_STRING.toString(), + HiveConf.ConfVars.CONVERT_ENUM_TO_STRING.defaultVal)) && + Enum.class.isAssignableFrom(c)) { + return PrimitiveObjectInspectorFactory + .getPrimitiveJavaObjectInspector(PrimitiveObjectInspector.PrimitiveCategory.STRING); + } + // Must be struct because List and Map need to be ParameterizedType assert (!List.class.isAssignableFrom(c)); assert (!Map.class.isAssignableFrom(c)); @@ -177,14 +196,14 @@ public final class ObjectInspectorFactory { } // put it into the cache BEFORE it is initialized to make sure we can catch // recursive types. - objectInspectorCache.put(t, oi); + objectInspectorCache.put(new MultiKey(t, properties), oi); Field[] fields = ObjectInspectorUtils.getDeclaredNonStaticFields(c); ArrayList structFieldObjectInspectors = new ArrayList( fields.length); for (int i = 0; i < fields.length; i++) { if (!oi.shouldIgnoreField(fields[i].getName())) { structFieldObjectInspectors.add(getReflectionObjectInspector(fields[i] - .getGenericType(), options)); + .getGenericType(), options, properties)); } } oi.init(c, structFieldObjectInspectors); diff --git serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/primitive/JavaStringObjectInspector.java serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/primitive/JavaStringObjectInspector.java index 921ce2b..b496ca5 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/primitive/JavaStringObjectInspector.java +++ serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/primitive/JavaStringObjectInspector.java @@ -32,12 +32,28 @@ public class JavaStringObjectInspector extends @Override public Text getPrimitiveWritableObject(Object o) { - return o == null ? null : new Text(((String) o)); + if (o == null) { + return null; + } + + if (Enum.class.isAssignableFrom(o.getClass())) { + return new Text(o.toString()); + } else { + return new Text((String) o); + } } @Override public String getPrimitiveJavaObject(Object o) { - return (String) o; + if (o == null) { + return null; + } + + if (Enum.class.isAssignableFrom(o.getClass())) { + return o.toString(); + } else { + return (String) o; + } } @Override diff --git serde/src/java/org/apache/hadoop/hive/serde2/thrift/ThriftByteStreamTypedSerDe.java serde/src/java/org/apache/hadoop/hive/serde2/thrift/ThriftByteStreamTypedSerDe.java index d98c5fb..8815f5c 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/thrift/ThriftByteStreamTypedSerDe.java +++ serde/src/java/org/apache/hadoop/hive/serde2/thrift/ThriftByteStreamTypedSerDe.java @@ -22,8 +22,10 @@ import java.lang.reflect.Type; import java.util.Properties; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.serde2.ByteStreamTypedSerDe; import org.apache.hadoop.hive.serde2.SerDeException; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; import org.apache.hadoop.io.Writable; import org.apache.thrift.TBase; @@ -37,6 +39,7 @@ import org.apache.thrift.transport.TIOStreamTransport; */ public class ThriftByteStreamTypedSerDe extends ByteStreamTypedSerDe { + protected Properties objectInspectorProperties = new Properties(); protected TIOStreamTransport outTransport, inTransport; protected TProtocol outProtocol, inProtocol; @@ -56,9 +59,12 @@ public class ThriftByteStreamTypedSerDe extends ByteStreamTypedSerDe { } public ThriftByteStreamTypedSerDe(Type objectType, - TProtocolFactory inFactory, TProtocolFactory outFactory) + TProtocolFactory inFactory, TProtocolFactory outFactory, Configuration conf) throws SerDeException { super(objectType); + objectInspectorProperties.setProperty(HiveConf.ConfVars.CONVERT_ENUM_TO_STRING.toString(), + conf.get(HiveConf.ConfVars.CONVERT_ENUM_TO_STRING.toString(), + HiveConf.ConfVars.CONVERT_ENUM_TO_STRING.defaultVal)); try { init(inFactory, outFactory); } catch (Exception e) { @@ -72,6 +78,12 @@ public class ThriftByteStreamTypedSerDe extends ByteStreamTypedSerDe { } @Override + public ObjectInspector getObjectInspector() throws SerDeException { + return ObjectInspectorFactory.getReflectionObjectInspector(objectType, + getObjectInspectorOptions(), objectInspectorProperties); + } + + @Override public Object deserialize(Writable field) throws SerDeException { Object obj = super.deserialize(field); try { diff --git serde/src/java/org/apache/hadoop/hive/serde2/thrift/ThriftDeserializer.java serde/src/java/org/apache/hadoop/hive/serde2/thrift/ThriftDeserializer.java index e5696ab..e7aeed3 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/thrift/ThriftDeserializer.java +++ serde/src/java/org/apache/hadoop/hive/serde2/thrift/ThriftDeserializer.java @@ -61,7 +61,7 @@ public class ThriftDeserializer implements Deserializer { TProtocolFactory tp = TReflectionUtils .getProtocolFactoryByName(protoName); - tsd = new ThriftByteStreamTypedSerDe(recordClass, tp, tp); + tsd = new ThriftByteStreamTypedSerDe(recordClass, tp, tp, job); } catch (Exception e) { throw new SerDeException(e); diff --git serde/src/test/org/apache/hadoop/hive/serde2/thrift_test/CreateSequenceFile.java serde/src/test/org/apache/hadoop/hive/serde2/thrift_test/CreateSequenceFile.java index 8aef773..7b76fa2 100644 --- serde/src/test/org/apache/hadoop/hive/serde2/thrift_test/CreateSequenceFile.java +++ serde/src/test/org/apache/hadoop/hive/serde2/thrift_test/CreateSequenceFile.java @@ -1,145 +1,147 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.serde2.thrift_test; - -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Random; - -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.serde2.ByteStream; -import org.apache.hadoop.hive.serde2.thrift.test.Complex; -import org.apache.hadoop.hive.serde2.thrift.test.IntString; -import org.apache.hadoop.io.BytesWritable; -import org.apache.hadoop.io.SequenceFile; -import org.apache.hadoop.io.Writable; -import org.apache.hadoop.mapred.JobConf; -import org.apache.thrift.TBase; -import org.apache.thrift.TException; -import org.apache.thrift.protocol.TBinaryProtocol; -import org.apache.thrift.protocol.TProtocol; -import org.apache.thrift.protocol.TProtocolFactory; -import org.apache.thrift.transport.TIOStreamTransport; - -/** - * CreateSequenceFile. - * - */ -public final class CreateSequenceFile { - - private CreateSequenceFile() { - // prevent instantiation - } - - public static void usage() { - System.out.println("Usage: CreateSequenceFile "); - System.exit(1); - } - - /** - * ThriftSerializer. - * - */ - public static class ThriftSerializer { - - private ByteStream.Output bos; - private TProtocol outProtocol; - - public ThriftSerializer() { - bos = new ByteStream.Output(); - TIOStreamTransport outTransport = new TIOStreamTransport(bos); - TProtocolFactory outFactory = new TBinaryProtocol.Factory(); - outProtocol = outFactory.getProtocol(outTransport); - } - - private BytesWritable bw = new BytesWritable(); - - public BytesWritable serialize(TBase base) throws TException { - bos.reset(); - base.write(outProtocol); - bw.set(bos.getData(), 0, bos.getCount()); - return bw; - } - } - - public static void main(String[] args) throws Exception { - - // Read parameters - int lines = 10; - List extraArgs = new ArrayList(); - for (int ai = 0; ai < args.length; ai++) { - if (args[ai].equals("-line") && ai + 1 < args.length) { - lines = Integer.parseInt(args[ai + 1]); - ai++; - } else { - extraArgs.add(args[ai]); - } - } - if (extraArgs.size() != 1) { - usage(); - } - - JobConf conf = new JobConf(CreateSequenceFile.class); - - ThriftSerializer serializer = new ThriftSerializer(); - - // Open files - SequenceFile.Writer writer = new SequenceFile.Writer(FileSystem.get(conf), - conf, new Path(extraArgs.get(0)), BytesWritable.class, - BytesWritable.class); - - // write to file - BytesWritable key = new BytesWritable(); - - Random rand = new Random(20081215); - - for (int i = 0; i < lines; i++) { - - ArrayList alist = new ArrayList(); - alist.add(i); - alist.add(i * 2); - alist.add(i * 3); - ArrayList slist = new ArrayList(); - slist.add("" + i * 10); - slist.add("" + i * 100); - slist.add("" + i * 1000); - ArrayList islist = new ArrayList(); - islist.add(new IntString(i * i, "" + i * i * i, i)); - HashMap hash = new HashMap(); - hash.put("key_" + i, "value_" + i); - - Complex complex = new Complex(rand.nextInt(), "record_" - + (new Integer(i)).toString(), alist, slist, islist, hash); - - Writable value = serializer.serialize(complex); - writer.append(key, value); - } - - // Add an all-null record - Complex complex = new Complex(0, null, null, null, null, null); - Writable value = serializer.serialize(complex); - writer.append(key, value); - - // Close files - writer.close(); - } - -} +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.serde2.thrift_test; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Random; + +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.serde2.ByteStream; +import org.apache.hadoop.hive.serde2.thrift.test.Complex; +import org.apache.hadoop.hive.serde2.thrift.test.IntString; +import org.apache.hadoop.hive.serde2.thrift.test.MyEnum; +import org.apache.hadoop.io.BytesWritable; +import org.apache.hadoop.io.SequenceFile; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.thrift.TBase; +import org.apache.thrift.TException; +import org.apache.thrift.protocol.TBinaryProtocol; +import org.apache.thrift.protocol.TProtocol; +import org.apache.thrift.protocol.TProtocolFactory; +import org.apache.thrift.transport.TIOStreamTransport; + +/** + * CreateSequenceFile. + * + */ +public final class CreateSequenceFile { + + private CreateSequenceFile() { + // prevent instantiation + } + + public static void usage() { + System.out.println("Usage: CreateSequenceFile "); + System.exit(1); + } + + /** + * ThriftSerializer. + * + */ + public static class ThriftSerializer { + + private ByteStream.Output bos; + private TProtocol outProtocol; + + public ThriftSerializer() { + bos = new ByteStream.Output(); + TIOStreamTransport outTransport = new TIOStreamTransport(bos); + TProtocolFactory outFactory = new TBinaryProtocol.Factory(); + outProtocol = outFactory.getProtocol(outTransport); + } + + private BytesWritable bw = new BytesWritable(); + + public BytesWritable serialize(TBase base) throws TException { + bos.reset(); + base.write(outProtocol); + bw.set(bos.getData(), 0, bos.getCount()); + return bw; + } + } + + public static void main(String[] args) throws Exception { + + // Read parameters + int lines = 10; + List extraArgs = new ArrayList(); + for (int ai = 0; ai < args.length; ai++) { + if (args[ai].equals("-line") && ai + 1 < args.length) { + lines = Integer.parseInt(args[ai + 1]); + ai++; + } else { + extraArgs.add(args[ai]); + } + } + if (extraArgs.size() != 1) { + usage(); + } + + JobConf conf = new JobConf(CreateSequenceFile.class); + + ThriftSerializer serializer = new ThriftSerializer(); + + // Open files + SequenceFile.Writer writer = new SequenceFile.Writer(FileSystem.get(conf), + conf, new Path(extraArgs.get(0)), BytesWritable.class, + BytesWritable.class); + + // write to file + BytesWritable key = new BytesWritable(); + + Random rand = new Random(20081215); + + for (int i = 0; i < lines; i++) { + + ArrayList alist = new ArrayList(); + alist.add(i); + alist.add(i * 2); + alist.add(i * 3); + ArrayList slist = new ArrayList(); + slist.add("" + i * 10); + slist.add("" + i * 100); + slist.add("" + i * 1000); + ArrayList islist = new ArrayList(); + islist.add(new IntString(i * i, "" + i * i * i, i)); + HashMap hash = new HashMap(); + hash.put("key_" + i, "value_" + i); + + Complex complex = new Complex(rand.nextInt(), "record_" + + (new Integer(i)).toString(), alist, slist, islist, hash, + MyEnum.findByValue(rand.nextInt(MyEnum.values().length) + 1 )); + + Writable value = serializer.serialize(complex); + writer.append(key, value); + } + + // Add an all-null record + Complex complex = new Complex(0, null, null, null, null, null, null); + Writable value = serializer.serialize(complex); + writer.append(key, value); + + // Close files + writer.close(); + } + +}