commit ea9fef597b7c000c702514b6f413fbaf5c64e466 Author: Vihang Karajgaonkar Date: Tue Feb 27 21:32:47 2018 -0800 Moved TypeInfos to standalone-metastore Introduce Avro storage schema reader diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/SerDeStorageSchemaReader.java b/metastore/src/java/org/apache/hadoop/hive/metastore/SerDeStorageSchemaReader.java index 59bcd5ca34d5083d357d7157abf3682399060a1a..05e10784a4ac075a458db30976d047447b12d7a0 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/SerDeStorageSchemaReader.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/SerDeStorageSchemaReader.java @@ -27,6 +27,10 @@ import java.util.List; +/** + * In order to use this Storage schema reader you should add the hive-serde jar in the classpath + * of the metastore. + */ public class SerDeStorageSchemaReader implements StorageSchemaReader { @Override public List readSchema(Table tbl, EnvironmentContext envContext, Configuration conf) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/ParseUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/ParseUtils.java index 89e84127a8aeb4a603df7a9cf7b7da2125a7949b..e1710fda574904b6f1ff7a240b771c174e4e1e5d 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/ParseUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/ParseUtils.java @@ -43,6 +43,7 @@ import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.MetastoreTypeInfoUtils; import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; @@ -149,7 +150,7 @@ private ParseUtils() { public static ExprNodeDesc createConversionCast(ExprNodeDesc column, PrimitiveTypeInfo tableFieldTypeInfo) throws SemanticException { // Get base type, since type string may be parameterized - String baseType = TypeInfoUtils.getBaseName(tableFieldTypeInfo.getTypeName()); + String baseType = MetastoreTypeInfoUtils.getBaseName(tableFieldTypeInfo.getTypeName()); // If the type cast UDF is for a parameterized type, then it should implement // the SettableUDF interface so that we can pass in the params. diff --git a/ql/src/test/results/clientnegative/avro_decimal.q.out b/ql/src/test/results/clientnegative/avro_decimal.q.out index 9d00d6ee322c1016feaac2a829e670694a3aab6d..046b7821a35b3ddac2fbe83742120e6a13b99b15 100644 --- a/ql/src/test/results/clientnegative/avro_decimal.q.out +++ b/ql/src/test/results/clientnegative/avro_decimal.q.out @@ -19,4 +19,4 @@ TBLPROPERTIES ( PREHOOK: type: CREATETABLE PREHOOK: Output: database:default PREHOOK: Output: default@avro_dec -FAILED: Execution Error, return code 1 from org.apache.hadoop.hive.ql.exec.DDLTask. java.lang.RuntimeException: MetaException(message:org.apache.hadoop.hive.serde2.avro.AvroSerdeException Invalid precision or scale for decimal type) +FAILED: Execution Error, return code 1 from org.apache.hadoop.hive.ql.exec.DDLTask. 
java.lang.IllegalArgumentException: Decimal precision out of allowed range [1,38] diff --git a/serde/pom.xml b/serde/pom.xml index 0247c32452180aad73eb1932110096c1d044bd15..b39c6b7c91d66f83ee75a4f85081bdab679fd91c 100644 --- a/serde/pom.xml +++ b/serde/pom.xml @@ -49,6 +49,11 @@ hive-shims ${project.version} + + org.apache.hive + hive-standalone-metastore + ${project.version} + com.google.code.findbugs diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/SerDeUtils.java b/serde/src/java/org/apache/hadoop/hive/serde2/SerDeUtils.java index 085835f8e57788122801592a99ac42eb68211342..1846fba9d0736518b94823b17e2281e885cdc903 100644 --- a/serde/src/java/org/apache/hadoop/hive/serde2/SerDeUtils.java +++ b/serde/src/java/org/apache/hadoop/hive/serde2/SerDeUtils.java @@ -24,7 +24,6 @@ import java.util.Properties; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.hive.serde2.AbstractSerDe; import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroObjectInspectorGenerator.java b/serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroObjectInspectorGenerator.java index a07cf09d11dc55262cdbac7ad45862d3cf1783a1..884f1ec4cb8c52fab4dd8a696ee03edd90bc3071 100644 --- a/serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroObjectInspectorGenerator.java +++ b/serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroObjectInspectorGenerator.java @@ -31,6 +31,7 @@ import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; import org.apache.hadoop.hive.serde2.typeinfo.UnionTypeInfo; /** @@ -50,7 +51,11 @@ public AvroObjectInspectorGenerator(Schema schema) throws SerDeException { verifySchemaIsARecord(schema); this.columnNames = AvroObjectInspectorGenerator.generateColumnNames(schema); - this.columnTypes = SchemaToTypeInfo.generateColumnTypes(schema); + try { + this.columnTypes = SchemaToHiveTypeInfo.getInstance().generateColumnTypes(schema); + } catch (Exception e) { + throw new AvroSerdeException(e.getMessage()); + } this.columnComments = AvroObjectInspectorGenerator.generateColumnComments(schema); assert columnNames.size() == columnTypes.size(); this.oi = createObjectInspector(); @@ -139,7 +144,7 @@ private ObjectInspector createObjectInspectorWorker(TypeInfo ti) throws SerDeExc } private boolean supportedCategories(TypeInfo ti) { - final ObjectInspector.Category c = ti.getCategory(); + final Category c = ti.getCategory(); return c.equals(ObjectInspector.Category.PRIMITIVE) || c.equals(ObjectInspector.Category.MAP) || c.equals(ObjectInspector.Category.LIST) || diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroSerializer.java b/serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroSerializer.java index 83e5d6822e6cd78f0bbc15266a78a36f18c7b769..e2043f63a5351cc9353fb772f7fdb832f0dcebf3 100644 --- a/serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroSerializer.java +++ b/serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroSerializer.java @@ -165,7 +165,11 @@ protected GenericEnumSymbol makeInstance(Object seed, }; private Object serializeEnum(TypeInfo typeInfo, PrimitiveObjectInspector fieldOI, Object structFieldData, Schema schema) throws 
AvroSerdeException { - return enums.retrieve(schema).retrieve(serializePrimitive(typeInfo, fieldOI, structFieldData, schema)); + try { + return enums.retrieve(schema).retrieve(serializePrimitive(typeInfo, fieldOI, structFieldData, schema)); + } catch (Exception e) { + throw new AvroSerdeException(e); + } } private Object serializeStruct(StructTypeInfo typeInfo, StructObjectInspector ssoi, Object o, Schema schema) throws AvroSerdeException { diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/avro/InstanceCache.java b/serde/src/java/org/apache/hadoop/hive/serde2/avro/InstanceCache.java deleted file mode 100644 index 2d5202091505990f46586b67f0243193db2a2253..0000000000000000000000000000000000000000 --- a/serde/src/java/org/apache/hadoop/hive/serde2/avro/InstanceCache.java +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.hadoop.hive.serde2.avro; - -import java.util.HashMap; -import java.util.Map; -import java.util.Set; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Cache for objects whose creation only depends on some other set of objects and therefore can be - * used against other equivalent versions of those objects. Essentially memoizes instance creation. - * - * @param Object that determines the instance. The cache uses this object as a key for - * its hash which is why it is imperative to have appropriate equals and hashcode - * implementation for this object for the cache to work properly - * @param Instance that will be created from SeedObject. 
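// Illustrative sketch, not part of this patch: the class above is declared
// generic as InstanceCache<SeedObject, Instance> and memoizes instance
// creation, so the expensive makeInstance() runs once per distinct seed
// object. The CachingConverter wrapper below is hypothetical, mirroring how
// SchemaToTypeInfo's typeInfoCache uses it:
//
//   import java.util.Set;
//   import org.apache.avro.Schema;
//   import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
//
//   abstract class CachingConverter {
//     private final InstanceCache<Schema, TypeInfo> cache =
//         new InstanceCache<Schema, TypeInfo>() {
//           @Override
//           protected TypeInfo makeInstance(Schema s, Set<Schema> seenSchemas)
//               throws AvroSerdeException {
//             return convert(s, seenSchemas);   // runs only on a cache miss
//           }
//         };
//
//     TypeInfo typeInfoFor(Schema s) throws AvroSerdeException {
//       return cache.retrieve(s);               // equal schemas hit the cache
//     }
//
//     abstract TypeInfo convert(Schema s, Set<Schema> seen) throws AvroSerdeException;
//   }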
- */ -public abstract class InstanceCache { - private static final Logger LOG = LoggerFactory.getLogger(InstanceCache.class); - Map cache = new HashMap(); - - public InstanceCache() {} - - /** - * Retrieve (or create if it doesn't exist) the correct Instance for this - * SeedObject - */ - public Instance retrieve(SeedObject hv) throws AvroSerdeException { - return retrieve(hv, null); - } - - /** - * Retrieve (or create if it doesn't exist) the correct Instance for this - * SeedObject using 'seenSchemas' to resolve circular references - */ - public synchronized Instance retrieve(SeedObject hv, - Set seenSchemas) throws AvroSerdeException { - if(LOG.isDebugEnabled()) LOG.debug("Checking for hv: " + hv.toString()); - - if(cache.containsKey(hv)) { - if(LOG.isDebugEnabled()) LOG.debug("Returning cache result."); - return cache.get(hv); - } - - if(LOG.isDebugEnabled()) LOG.debug("Creating new instance and storing in cache"); - - Instance instance = makeInstance(hv, seenSchemas); - cache.put(hv, instance); - return instance; - } - - protected abstract Instance makeInstance(SeedObject hv, - Set seenSchemas) throws AvroSerdeException; -} diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/avro/SchemaResolutionProblem.java b/serde/src/java/org/apache/hadoop/hive/serde2/avro/SchemaResolutionProblem.java deleted file mode 100644 index 65f104dcaa37f9fff17ad1a8e690d17041cf7f06..0000000000000000000000000000000000000000 --- a/serde/src/java/org/apache/hadoop/hive/serde2/avro/SchemaResolutionProblem.java +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.hadoop.hive.serde2.avro; - -import org.apache.avro.Schema; - -class SchemaResolutionProblem { - static final String sentinelString = "{\n" + - " \"namespace\": \"org.apache.hadoop.hive\",\n" + - " \"name\": \"CannotDetermineSchemaSentinel\",\n" + - " \"type\": \"record\",\n" + - " \"fields\": [\n" + - " {\n" + - " \"name\":\"ERROR_ERROR_ERROR_ERROR_ERROR_ERROR_ERROR\",\n" + - " \"type\":\"string\"\n" + - " },\n" + - " {\n" + - " \"name\":\"Cannot_determine_schema\",\n" + - " \"type\":\"string\"\n" + - " },\n" + - " {\n" + - " \"name\":\"check\",\n" + - " \"type\":\"string\"\n" + - " },\n" + - " {\n" + - " \"name\":\"schema\",\n" + - " \"type\":\"string\"\n" + - " },\n" + - " {\n" + - " \"name\":\"url\",\n" + - " \"type\":\"string\"\n" + - " },\n" + - " {\n" + - " \"name\":\"and\",\n" + - " \"type\":\"string\"\n" + - " },\n" + - " {\n" + - " \"name\":\"literal\",\n" + - " \"type\":\"string\"\n" + - " }\n" + - " ]\n" + - "}"; - public final static Schema SIGNAL_BAD_SCHEMA = AvroSerdeUtils.getSchemaFor(sentinelString); -} diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/avro/SchemaToHiveTypeInfo.java b/serde/src/java/org/apache/hadoop/hive/serde2/avro/SchemaToHiveTypeInfo.java new file mode 100644 index 0000000000000000000000000000000000000000..d82f5112ba9396f6a3370574a11adb1c908fbb26 --- /dev/null +++ b/serde/src/java/org/apache/hadoop/hive/serde2/avro/SchemaToHiveTypeInfo.java @@ -0,0 +1,18 @@ +package org.apache.hadoop.hive.serde2.avro; + +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; + +public class SchemaToHiveTypeInfo extends SchemaToTypeInfo { + private static final SchemaToHiveTypeInfo instance = new SchemaToHiveTypeInfo(); + + private SchemaToHiveTypeInfo() { + //use getInstance to get this object. The base class uses cache to reuse + //Types when available + super(TypeInfoFactory.getInstance()); + } + + public static final SchemaToHiveTypeInfo getInstance() { + return instance; + } +} diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/avro/SchemaToTypeInfo.java b/serde/src/java/org/apache/hadoop/hive/serde2/avro/SchemaToTypeInfo.java deleted file mode 100644 index 35d83bdb1af03df674842b72422e4b7d4d22b596..0000000000000000000000000000000000000000 --- a/serde/src/java/org/apache/hadoop/hive/serde2/avro/SchemaToTypeInfo.java +++ /dev/null @@ -1,283 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
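// Illustrative sketch, not part of this patch: callers obtain column types
// through the new SchemaToHiveTypeInfo singleton, so the TypeInfo cache in
// the (moved) base class is shared by every call site; compare the updated
// call in AvroObjectInspectorGenerator earlier in this diff:
//
//   Schema schema = ...;                        // an Avro record schema
//   List<TypeInfo> types =
//       SchemaToHiveTypeInfo.getInstance().generateColumnTypes(schema);
//
// After this commit the base class lives in standalone-metastore and may
// surface exceptions other than AvroSerdeException, which is why the call
// site above wraps the call in a broad try/catch.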
- */ -package org.apache.hadoop.hive.serde2.avro; - -import static org.apache.avro.Schema.Type.BOOLEAN; -import static org.apache.avro.Schema.Type.BYTES; -import static org.apache.avro.Schema.Type.DOUBLE; -import static org.apache.avro.Schema.Type.FIXED; -import static org.apache.avro.Schema.Type.FLOAT; -import static org.apache.avro.Schema.Type.INT; -import static org.apache.avro.Schema.Type.LONG; -import static org.apache.avro.Schema.Type.NULL; -import static org.apache.avro.Schema.Type.STRING; - -import java.util.ArrayList; -import java.util.Collections; -import java.util.Hashtable; -import java.util.IdentityHashMap; -import java.util.List; -import java.util.Map; -import java.util.Set; - -import org.apache.avro.Schema; -import org.apache.hadoop.hive.serde2.typeinfo.HiveDecimalUtils; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; - -/** - * Convert an Avro Schema to a Hive TypeInfo - */ -class SchemaToTypeInfo { - // Conversion of Avro primitive types to Hive primitive types - // Avro Hive - // Null - // boolean boolean check - // int int check - // long bigint check - // float double check - // double double check - // bytes binary check - // fixed binary check - // string string check - // tinyint - // smallint - - // Map of Avro's primitive types to Hives (for those that are supported by both) - private static final Map primitiveTypeToTypeInfo = initTypeMap(); - private static Map initTypeMap() { - Map theMap = new Hashtable(); - theMap.put(NULL, TypeInfoFactory.getPrimitiveTypeInfo("void")); - theMap.put(BOOLEAN, TypeInfoFactory.getPrimitiveTypeInfo("boolean")); - theMap.put(INT, TypeInfoFactory.getPrimitiveTypeInfo("int")); - theMap.put(LONG, TypeInfoFactory.getPrimitiveTypeInfo("bigint")); - theMap.put(FLOAT, TypeInfoFactory.getPrimitiveTypeInfo("float")); - theMap.put(DOUBLE, TypeInfoFactory.getPrimitiveTypeInfo("double")); - theMap.put(BYTES, TypeInfoFactory.getPrimitiveTypeInfo("binary")); - theMap.put(FIXED, TypeInfoFactory.getPrimitiveTypeInfo("binary")); - theMap.put(STRING, TypeInfoFactory.getPrimitiveTypeInfo("string")); - return Collections.unmodifiableMap(theMap); - } - - /** - * Generate a list of of TypeInfos from an Avro schema. This method is - * currently public due to some weirdness in deserializing unions, but - * will be made private once that is resolved. - * @param schema Schema to generate field types for - * @return List of TypeInfos, each element of which is a TypeInfo derived - * from the schema. - * @throws AvroSerdeException for problems during conversion. - */ - public static List generateColumnTypes(Schema schema) throws AvroSerdeException { - return generateColumnTypes (schema, null); - } - - /** - * Generate a list of of TypeInfos from an Avro schema. This method is - * currently public due to some weirdness in deserializing unions, but - * will be made private once that is resolved. - * @param schema Schema to generate field types for - * @param seenSchemas stores schemas processed in the parsing done so far, - * helping to resolve circular references in the schema - * @return List of TypeInfos, each element of which is a TypeInfo derived - * from the schema. - * @throws AvroSerdeException for problems during conversion. 
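// Illustrative example, not part of this patch: for a record schema the
// method below returns one TypeInfo per field, honouring logical types such
// as decimal, whose precision and scale are validated (see the updated
// avro_decimal.q.out output earlier in this diff):
//
//   Schema s = new Schema.Parser().parse(
//       "{\"type\":\"record\",\"name\":\"r\",\"fields\":["
//     + "{\"name\":\"a\",\"type\":\"int\"},"
//     + "{\"name\":\"d\",\"type\":{\"type\":\"bytes\","
//     + "\"logicalType\":\"decimal\",\"precision\":5,\"scale\":2}}]}");
//   generateColumnTypes(s, null);   // -> [int, decimal(5,2)]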
- */ - public static List generateColumnTypes(Schema schema, - Set seenSchemas) throws AvroSerdeException { - List fields = schema.getFields(); - - List types = new ArrayList(fields.size()); - - for (Schema.Field field : fields) { - types.add(generateTypeInfo(field.schema(), seenSchemas)); - } - - return types; - } - - static InstanceCache typeInfoCache = new InstanceCache() { - @Override - protected TypeInfo makeInstance(Schema s, - Set seenSchemas) - throws AvroSerdeException { - return generateTypeInfoWorker(s, seenSchemas); - } - }; - /** - * Convert an Avro Schema into an equivalent Hive TypeInfo. - * @param schema to record. Must be of record type. - * @param seenSchemas stores schemas processed in the parsing done so far, - * helping to resolve circular references in the schema - * @return TypeInfo matching the Avro schema - * @throws AvroSerdeException for any problems during conversion. - */ - public static TypeInfo generateTypeInfo(Schema schema, - Set seenSchemas) throws AvroSerdeException { - // For bytes type, it can be mapped to decimal. - Schema.Type type = schema.getType(); - if (type == BYTES && AvroSerDe.DECIMAL_TYPE_NAME - .equalsIgnoreCase(schema.getProp(AvroSerDe.AVRO_PROP_LOGICAL_TYPE))) { - int precision = 0; - int scale = 0; - try { - precision = schema.getJsonProp(AvroSerDe.AVRO_PROP_PRECISION).getIntValue(); - scale = schema.getJsonProp(AvroSerDe.AVRO_PROP_SCALE).getIntValue(); - } catch (Exception ex) { - throw new AvroSerdeException("Failed to obtain scale value from file schema: " + schema, ex); - } - - try { - HiveDecimalUtils.validateParameter(precision, scale); - } catch (Exception ex) { - throw new AvroSerdeException("Invalid precision or scale for decimal type", ex); - } - - return TypeInfoFactory.getDecimalTypeInfo(precision, scale); - } - - if (type == STRING && - AvroSerDe.CHAR_TYPE_NAME.equalsIgnoreCase(schema.getProp(AvroSerDe.AVRO_PROP_LOGICAL_TYPE))) { - int maxLength = 0; - try { - maxLength = schema.getJsonProp(AvroSerDe.AVRO_PROP_MAX_LENGTH).getValueAsInt(); - } catch (Exception ex) { - throw new AvroSerdeException("Failed to obtain maxLength value from file schema: " + schema, ex); - } - return TypeInfoFactory.getCharTypeInfo(maxLength); - } - - if (type == STRING && AvroSerDe.VARCHAR_TYPE_NAME - .equalsIgnoreCase(schema.getProp(AvroSerDe.AVRO_PROP_LOGICAL_TYPE))) { - int maxLength = 0; - try { - maxLength = schema.getJsonProp(AvroSerDe.AVRO_PROP_MAX_LENGTH).getValueAsInt(); - } catch (Exception ex) { - throw new AvroSerdeException("Failed to obtain maxLength value from file schema: " + schema, ex); - } - return TypeInfoFactory.getVarcharTypeInfo(maxLength); - } - - if (type == INT && - AvroSerDe.DATE_TYPE_NAME.equals(schema.getProp(AvroSerDe.AVRO_PROP_LOGICAL_TYPE))) { - return TypeInfoFactory.dateTypeInfo; - } - - if (type == LONG && - AvroSerDe.TIMESTAMP_TYPE_NAME.equals(schema.getProp(AvroSerDe.AVRO_PROP_LOGICAL_TYPE))) { - return TypeInfoFactory.timestampTypeInfo; - } - - return typeInfoCache.retrieve(schema, seenSchemas); - } - - private static TypeInfo generateTypeInfoWorker(Schema schema, - Set seenSchemas) throws AvroSerdeException { - // Avro requires NULLable types to be defined as unions of some type T - // and NULL. This is annoying and we're going to hide it from the user. 
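// Illustrative example, not part of this patch: unwrapping a nullable union
// before conversion,
//
//   Schema nullable = new Schema.Parser().parse("[\"null\",\"string\"]");
//   AvroSerdeUtils.isNullableType(nullable);                          // true
//   AvroSerdeUtils.getOtherTypeFromNullableType(nullable).getType();  // STRING
//
// so the column surfaces in Hive as plain string, not uniontype<void,string>.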
- if(AvroSerdeUtils.isNullableType(schema)) { - return generateTypeInfo( - AvroSerdeUtils.getOtherTypeFromNullableType(schema), seenSchemas); - } - - Schema.Type type = schema.getType(); - if(primitiveTypeToTypeInfo.containsKey(type)) { - return primitiveTypeToTypeInfo.get(type); - } - - switch(type) { - case RECORD: return generateRecordTypeInfo(schema, seenSchemas); - case MAP: return generateMapTypeInfo(schema, seenSchemas); - case ARRAY: return generateArrayTypeInfo(schema, seenSchemas); - case UNION: return generateUnionTypeInfo(schema, seenSchemas); - case ENUM: return generateEnumTypeInfo(schema); - default: throw new AvroSerdeException("Do not yet support: " + schema); - } - } - - private static TypeInfo generateRecordTypeInfo(Schema schema, - Set seenSchemas) throws AvroSerdeException { - assert schema.getType().equals(Schema.Type.RECORD); - - if (seenSchemas == null) { - seenSchemas = Collections.newSetFromMap(new IdentityHashMap()); - } else if (seenSchemas.contains(schema)) { - throw new AvroSerdeException( - "Recursive schemas are not supported. Recursive schema was " + schema - .getFullName()); - } - seenSchemas.add(schema); - - List fields = schema.getFields(); - List fieldNames = new ArrayList(fields.size()); - List typeInfos = new ArrayList(fields.size()); - - for(int i = 0; i < fields.size(); i++) { - fieldNames.add(i, fields.get(i).name()); - typeInfos.add(i, generateTypeInfo(fields.get(i).schema(), seenSchemas)); - } - - return TypeInfoFactory.getStructTypeInfo(fieldNames, typeInfos); - } - - /** - * Generate a TypeInfo for an Avro Map. This is made slightly simpler in that - * Avro only allows maps with strings for keys. - */ - private static TypeInfo generateMapTypeInfo(Schema schema, - Set seenSchemas) throws AvroSerdeException { - assert schema.getType().equals(Schema.Type.MAP); - Schema valueType = schema.getValueType(); - TypeInfo ti = generateTypeInfo(valueType, seenSchemas); - - return TypeInfoFactory.getMapTypeInfo(TypeInfoFactory.getPrimitiveTypeInfo("string"), ti); - } - - private static TypeInfo generateArrayTypeInfo(Schema schema, - Set seenSchemas) throws AvroSerdeException { - assert schema.getType().equals(Schema.Type.ARRAY); - Schema itemsType = schema.getElementType(); - TypeInfo itemsTypeInfo = generateTypeInfo(itemsType, seenSchemas); - - return TypeInfoFactory.getListTypeInfo(itemsTypeInfo); - } - - private static TypeInfo generateUnionTypeInfo(Schema schema, - Set seenSchemas) throws AvroSerdeException { - assert schema.getType().equals(Schema.Type.UNION); - List types = schema.getTypes(); - - - List typeInfos = new ArrayList(types.size()); - - for(Schema type : types) { - typeInfos.add(generateTypeInfo(type, seenSchemas)); - } - - return TypeInfoFactory.getUnionTypeInfo(typeInfos); - } - - // Hive doesn't have an Enum type, so we're going to treat them as Strings. - // During the deserialize/serialize stage we'll check for enumness and - // convert as such. 
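// Illustrative example, not part of this patch:
//
//   Schema e = new Schema.Parser().parse(
//       "{\"type\":\"enum\",\"name\":\"suit\",\"symbols\":[\"SPADES\",\"HEARTS\"]}");
//   generateTypeInfo(e, null).getTypeName();   // -> "string"
//
// AvroSerializer and AvroDeserializer restore the enum symbols on write/read.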
- private static TypeInfo generateEnumTypeInfo(Schema schema) { - assert schema.getType().equals(Schema.Type.ENUM); - - return TypeInfoFactory.getPrimitiveTypeInfo("string"); - } -} diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/avro/TypeInfoToSchema.java b/serde/src/java/org/apache/hadoop/hive/serde2/avro/TypeInfoToSchema.java deleted file mode 100644 index 4f8b05f5aeac3099128453c74bdef0f98986888d..0000000000000000000000000000000000000000 --- a/serde/src/java/org/apache/hadoop/hive/serde2/avro/TypeInfoToSchema.java +++ /dev/null @@ -1,283 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.hadoop.hive.serde2.avro; - -import org.apache.avro.Schema; -import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; -import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo; -import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo; -import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo; -import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo; -import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; -import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; -import org.apache.hadoop.hive.serde2.typeinfo.UnionTypeInfo; -import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo; -import org.codehaus.jackson.JsonNode; -import org.codehaus.jackson.node.JsonNodeFactory; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - -/** - * Convert Hive TypeInfo to an Avro Schema - */ -public class TypeInfoToSchema { - - private long recordCounter = 0; - - /** - * Converts Hive schema to avro schema - * - * @param columnNames Names of the hive columns - * @param columnTypes Hive Column types - * @param namespace Namespace of Avro schema - * @param name Avro schema name - * @param doc Avro schema doc - * @return Avro Schema - */ - public Schema convert(List columnNames, List columnTypes, - List columnComments, String namespace, String name, String doc) { - - List fields = new ArrayList(); - for (int i = 0; i < columnNames.size(); ++i) { - final String comment = columnComments.size() > i ? 
columnComments.get(i) : null; - final Schema.Field avroField = createAvroField(columnNames.get(i), columnTypes.get(i), - comment); - fields.addAll(getFields(avroField)); - } - - if (name == null || name.isEmpty()) { - name = "baseRecord"; - } - - Schema avroSchema = Schema.createRecord(name, doc, namespace, false); - avroSchema.setFields(fields); - return avroSchema; - } - - private Schema.Field createAvroField(String name, TypeInfo typeInfo, String comment) { - return new Schema.Field(name, createAvroSchema(typeInfo), comment, null); - } - - private Schema createAvroSchema(TypeInfo typeInfo) { - Schema schema = null; - switch (typeInfo.getCategory()) { - case PRIMITIVE: - schema = createAvroPrimitive(typeInfo); - break; - case LIST: - schema = createAvroArray(typeInfo); - break; - case MAP: - schema = createAvroMap(typeInfo); - break; - case STRUCT: - schema = createAvroRecord(typeInfo); - break; - case UNION: - schema = createAvroUnion(typeInfo); - break; - } - - return wrapInUnionWithNull(schema); - } - - private Schema createAvroPrimitive(TypeInfo typeInfo) { - PrimitiveTypeInfo primitiveTypeInfo = (PrimitiveTypeInfo) typeInfo; - Schema schema; - switch (primitiveTypeInfo.getPrimitiveCategory()) { - case STRING: - schema = Schema.create(Schema.Type.STRING); - break; - case CHAR: - schema = AvroSerdeUtils.getSchemaFor("{" + - "\"type\":\"" + AvroSerDe.AVRO_STRING_TYPE_NAME + "\"," + - "\"logicalType\":\"" + AvroSerDe.CHAR_TYPE_NAME + "\"," + - "\"maxLength\":" + ((CharTypeInfo) typeInfo).getLength() + "}"); - break; - case VARCHAR: - schema = AvroSerdeUtils.getSchemaFor("{" + - "\"type\":\"" + AvroSerDe.AVRO_STRING_TYPE_NAME + "\"," + - "\"logicalType\":\"" + AvroSerDe.VARCHAR_TYPE_NAME + "\"," + - "\"maxLength\":" + ((VarcharTypeInfo) typeInfo).getLength() + "}"); - break; - case BINARY: - schema = Schema.create(Schema.Type.BYTES); - break; - case BYTE: - schema = Schema.create(Schema.Type.INT); - break; - case SHORT: - schema = Schema.create(Schema.Type.INT); - break; - case INT: - schema = Schema.create(Schema.Type.INT); - break; - case LONG: - schema = Schema.create(Schema.Type.LONG); - break; - case FLOAT: - schema = Schema.create(Schema.Type.FLOAT); - break; - case DOUBLE: - schema = Schema.create(Schema.Type.DOUBLE); - break; - case BOOLEAN: - schema = Schema.create(Schema.Type.BOOLEAN); - break; - case DECIMAL: - DecimalTypeInfo decimalTypeInfo = (DecimalTypeInfo) typeInfo; - String precision = String.valueOf(decimalTypeInfo.precision()); - String scale = String.valueOf(decimalTypeInfo.scale()); - schema = AvroSerdeUtils.getSchemaFor("{" + - "\"type\":\"bytes\"," + - "\"logicalType\":\"decimal\"," + - "\"precision\":" + precision + "," + - "\"scale\":" + scale + "}"); - break; - case DATE: - schema = AvroSerdeUtils.getSchemaFor("{" + - "\"type\":\"" + AvroSerDe.AVRO_INT_TYPE_NAME + "\"," + - "\"logicalType\":\"" + AvroSerDe.DATE_TYPE_NAME + "\"}"); - break; - case TIMESTAMP: - schema = AvroSerdeUtils.getSchemaFor("{" + - "\"type\":\"" + AvroSerDe.AVRO_LONG_TYPE_NAME + "\"," + - "\"logicalType\":\"" + AvroSerDe.TIMESTAMP_TYPE_NAME + "\"}"); - break; - case VOID: - schema = Schema.create(Schema.Type.NULL); - break; - default: - throw new UnsupportedOperationException(typeInfo + " is not supported."); - } - return schema; - } - - private Schema createAvroUnion(TypeInfo typeInfo) { - List childSchemas = new ArrayList(); - for (TypeInfo childTypeInfo : ((UnionTypeInfo) typeInfo).getAllUnionObjectTypeInfos()) { - final Schema childSchema = createAvroSchema(childTypeInfo); - if 
(childSchema.getType() == Schema.Type.UNION) { - childSchemas.addAll(childSchema.getTypes()); - } else { - childSchemas.add(childSchema); - } - } - - return Schema.createUnion(removeDuplicateNullSchemas(childSchemas)); - } - - private Schema createAvroRecord(TypeInfo typeInfo) { - List childFields = new ArrayList(); - - final List allStructFieldNames = - ((StructTypeInfo) typeInfo).getAllStructFieldNames(); - final List allStructFieldTypeInfos = - ((StructTypeInfo) typeInfo).getAllStructFieldTypeInfos(); - if (allStructFieldNames.size() != allStructFieldTypeInfos.size()) { - throw new IllegalArgumentException("Failed to generate avro schema from hive schema. " + - "name and column type differs. names = " + allStructFieldNames + ", types = " + - allStructFieldTypeInfos); - } - - for (int i = 0; i < allStructFieldNames.size(); ++i) { - final TypeInfo childTypeInfo = allStructFieldTypeInfos.get(i); - final Schema.Field grandChildSchemaField = createAvroField(allStructFieldNames.get(i), - childTypeInfo, childTypeInfo.toString()); - final List grandChildFields = getFields(grandChildSchemaField); - childFields.addAll(grandChildFields); - } - - Schema recordSchema = Schema.createRecord("record_" + recordCounter, typeInfo.toString(), - null, false); - ++recordCounter; - recordSchema.setFields(childFields); - return recordSchema; - } - - private Schema createAvroMap(TypeInfo typeInfo) { - TypeInfo keyTypeInfo = ((MapTypeInfo) typeInfo).getMapKeyTypeInfo(); - if (((PrimitiveTypeInfo) keyTypeInfo).getPrimitiveCategory() - != PrimitiveObjectInspector.PrimitiveCategory.STRING) { - throw new UnsupportedOperationException("Key of Map can only be a String"); - } - - TypeInfo valueTypeInfo = ((MapTypeInfo) typeInfo).getMapValueTypeInfo(); - Schema valueSchema = createAvroSchema(valueTypeInfo); - - return Schema.createMap(valueSchema); - } - - private Schema createAvroArray(TypeInfo typeInfo) { - ListTypeInfo listTypeInfo = (ListTypeInfo) typeInfo; - Schema listSchema = createAvroSchema(listTypeInfo.getListElementTypeInfo()); - return Schema.createArray(listSchema); - } - - private List getFields(Schema.Field schemaField) { - List fields = new ArrayList(); - - JsonNode nullDefault = JsonNodeFactory.instance.nullNode(); - if (schemaField.schema().getType() == Schema.Type.RECORD) { - for (Schema.Field field : schemaField.schema().getFields()) { - fields.add(new Schema.Field(field.name(), field.schema(), field.doc(), nullDefault)); - } - } else { - fields.add(new Schema.Field(schemaField.name(), schemaField.schema(), schemaField.doc(), - nullDefault)); - } - - return fields; - } - - private Schema wrapInUnionWithNull(Schema schema) { - Schema wrappedSchema = schema; - switch (schema.getType()) { - case NULL: - break; - case UNION: - List existingSchemas = removeDuplicateNullSchemas(schema.getTypes()); - wrappedSchema = Schema.createUnion(existingSchemas); - break; - default: - wrappedSchema = Schema.createUnion(Arrays.asList(Schema.create(Schema.Type.NULL), schema)); - } - - return wrappedSchema; - } - - private List removeDuplicateNullSchemas(List childSchemas) { - List prunedSchemas = new ArrayList(); - boolean isNullPresent = false; - for (Schema schema : childSchemas) { - if (schema.getType() == Schema.Type.NULL) { - isNullPresent = true; - } else { - prunedSchemas.add(schema); - } - } - if (isNullPresent) { - prunedSchemas.add(0, Schema.create(Schema.Type.NULL)); - } - - return prunedSchemas; - } -} \ No newline at end of file diff --git 
a/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ObjectInspector.java b/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ObjectInspector.java deleted file mode 100644 index 99b565dd3fcb14fa1b05408e48a776260d1b47ac..0000000000000000000000000000000000000000 --- a/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ObjectInspector.java +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.serde2.objectinspector; - -import org.apache.hadoop.hive.common.classification.InterfaceAudience; -import org.apache.hadoop.hive.common.classification.InterfaceStability; - -/** - * ObjectInspector helps us to look into the internal structure of a complex - * object. - * - * A (probably configured) ObjectInspector instance stands for a specific type - * and a specific way to store the data of that type in the memory. - * - * For native java Object, we can directly access the internal structure through - * member fields and methods. ObjectInspector is a way to delegate that - * functionality away from the Object, so that we have more control on the - * behavior of those actions. - * - * An efficient implementation of ObjectInspector should rely on factory, so - * that we can make sure the same ObjectInspector only has one instance. That - * also makes sure hashCode() and equals() methods of java.lang.Object directly - * works for ObjectInspector as well. - */ -@InterfaceAudience.Public -@InterfaceStability.Stable -public interface ObjectInspector extends Cloneable { - - /** - * Category. - * - */ - public static enum Category { - PRIMITIVE, LIST, MAP, STRUCT, UNION - }; - - /** - * Returns the name of the data type that is inspected by this - * ObjectInspector. This is used to display the type information to the user. - * - * For primitive types, the type name is standardized. For other types, the - * type name can be something like "list<int>", "map<int,string>", java class - * names, or user-defined type names similar to typedef. - */ - String getTypeName(); - - /** - * An ObjectInspector must inherit from one of the following interfaces if - * getCategory() returns: PRIMITIVE: PrimitiveObjectInspector LIST: - * ListObjectInspector MAP: MapObjectInspector STRUCT: StructObjectInspector. 
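// Illustrative sketch, not part of this patch: dispatching on getCategory()
// to the matching sub-interface, per the contract documented above. All
// inspector methods used here come from the existing objectinspector package.
//
//   static String describe(ObjectInspector oi) {
//     switch (oi.getCategory()) {
//       case PRIMITIVE:
//         return "primitive " + ((PrimitiveObjectInspector) oi).getPrimitiveCategory();
//       case LIST:
//         return "list of "
//             + ((ListObjectInspector) oi).getListElementObjectInspector().getTypeName();
//       case MAP:
//         return "map valued by "
//             + ((MapObjectInspector) oi).getMapValueObjectInspector().getTypeName();
//       case STRUCT:
//         return "struct with "
//             + ((StructObjectInspector) oi).getAllStructFieldRefs().size() + " fields";
//       case UNION:
//         return "union over "
//             + ((UnionObjectInspector) oi).getObjectInspectors().size() + " types";
//       default:
//         return oi.getTypeName();
//     }
//   }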
- */ - Category getCategory(); -} diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/BaseCharTypeInfo.java b/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/BaseCharTypeInfo.java index 820fb4ef1dc17479d9e7fea0890d5834b806106d..4ffe50fd95dabdb33c957591c5686c142733ca53 100644 --- a/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/BaseCharTypeInfo.java +++ b/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/BaseCharTypeInfo.java @@ -67,4 +67,9 @@ public void setTypeName(String typeName) { // type name should already be set by subclass return; } + + @Override + public Object[] getParameters() { + return new Object[] { length }; + } } diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/DecimalTypeInfo.java b/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/DecimalTypeInfo.java index 2e76df51e77f6e2c965deab17e04483d92149660..c5cb32b5eefd877770566de9dd716224e01a7ea4 100644 --- a/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/DecimalTypeInfo.java +++ b/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/DecimalTypeInfo.java @@ -127,4 +127,9 @@ public void setScale(int scale) { this.scale = scale; } + @Override + public Object[] getParameters() { + return new Object[] { precision, scale }; + } + } diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/ListTypeInfo.java b/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/ListTypeInfo.java deleted file mode 100644 index c632bffeb3cf10e0c87b70342f9541773d638ece..0000000000000000000000000000000000000000 --- a/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/ListTypeInfo.java +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.serde2.typeinfo; - -import java.io.Serializable; - -import org.apache.hadoop.hive.common.classification.InterfaceAudience; -import org.apache.hadoop.hive.common.classification.InterfaceStability; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; - -/** - * A List Type has homogeneous elements. All elements of the List has the same - * TypeInfo which is returned by getListElementTypeInfo. - * - * Always use the TypeInfoFactory to create new TypeInfo objects, instead of - * directly creating an instance of this class. - */ -@InterfaceAudience.Public -@InterfaceStability.Stable -public final class ListTypeInfo extends TypeInfo implements Serializable { - - private static final long serialVersionUID = 1L; - private TypeInfo listElementTypeInfo; - - /** - * For java serialization use only. 
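// Illustrative example, not part of this patch. Always obtain instances via
// the factory, which canonicalizes and caches equal types:
//
//   TypeInfo list = TypeInfoFactory.getListTypeInfo(
//       TypeInfoFactory.getPrimitiveTypeInfo("int"));
//   list.getTypeName();   // -> "array<int>"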
- */ - public ListTypeInfo() { - } - - @Override - public String getTypeName() { - return org.apache.hadoop.hive.serde.serdeConstants.LIST_TYPE_NAME + "<" - + listElementTypeInfo.getTypeName() + ">"; - } - - /** - * For java serialization use only. - */ - public void setListElementTypeInfo(TypeInfo listElementTypeInfo) { - this.listElementTypeInfo = listElementTypeInfo; - } - - /** - * For TypeInfoFactory use only. - */ - ListTypeInfo(TypeInfo elementTypeInfo) { - listElementTypeInfo = elementTypeInfo; - } - - @Override - public Category getCategory() { - return Category.LIST; - } - - public TypeInfo getListElementTypeInfo() { - return listElementTypeInfo; - } - - @Override - public boolean equals(Object other) { - if (this == other) { - return true; - } - if (!(other instanceof ListTypeInfo)) { - return false; - } - return getListElementTypeInfo().equals( - ((ListTypeInfo) other).getListElementTypeInfo()); - } - - @Override - public int hashCode() { - return listElementTypeInfo.hashCode(); - } - -} diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/MapTypeInfo.java b/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/MapTypeInfo.java deleted file mode 100644 index 1344c902293d57e26dab6c52d8a454e9610217c0..0000000000000000000000000000000000000000 --- a/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/MapTypeInfo.java +++ /dev/null @@ -1,109 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.serde2.typeinfo; - -import java.io.Serializable; - -import org.apache.hadoop.hive.common.classification.InterfaceAudience; -import org.apache.hadoop.hive.common.classification.InterfaceStability; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; - -/** - * A Map Type has homogeneous keys and homogeneous values. All keys of the Map - * have the same TypeInfo, which is returned by getMapKeyTypeInfo(); and all - * values of the Map has the same TypeInfo, which is returned by - * getMapValueTypeInfo(). - * - * Always use the TypeInfoFactory to create new TypeInfo objects, instead of - * directly creating an instance of this class. - */ -@InterfaceAudience.Public -@InterfaceStability.Stable -public final class MapTypeInfo extends TypeInfo implements Serializable { - - private static final long serialVersionUID = 1L; - - private TypeInfo mapKeyTypeInfo; - private TypeInfo mapValueTypeInfo; - - /** - * For java serialization use only. - */ - public MapTypeInfo() { - } - - @Override - public String getTypeName() { - return org.apache.hadoop.hive.serde.serdeConstants.MAP_TYPE_NAME + "<" - + mapKeyTypeInfo.getTypeName() + "," + mapValueTypeInfo.getTypeName() - + ">"; - } - - /** - * For java serialization use only. 
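// Illustrative example, not part of this patch:
//
//   TypeInfo map = TypeInfoFactory.getMapTypeInfo(
//       TypeInfoFactory.getPrimitiveTypeInfo("string"),
//       TypeInfoFactory.getPrimitiveTypeInfo("bigint"));
//   map.getTypeName();   // -> "map<string,bigint>"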
- */ - public void setMapKeyTypeInfo(TypeInfo mapKeyTypeInfo) { - this.mapKeyTypeInfo = mapKeyTypeInfo; - } - - /** - * For java serialization use only. - */ - public void setMapValueTypeInfo(TypeInfo mapValueTypeInfo) { - this.mapValueTypeInfo = mapValueTypeInfo; - } - - // For TypeInfoFactory use only - MapTypeInfo(TypeInfo keyTypeInfo, TypeInfo valueTypeInfo) { - mapKeyTypeInfo = keyTypeInfo; - mapValueTypeInfo = valueTypeInfo; - } - - @Override - public Category getCategory() { - return Category.MAP; - } - - public TypeInfo getMapKeyTypeInfo() { - return mapKeyTypeInfo; - } - - public TypeInfo getMapValueTypeInfo() { - return mapValueTypeInfo; - } - - @Override - public boolean equals(Object other) { - if (this == other) { - return true; - } - if (!(other instanceof MapTypeInfo)) { - return false; - } - MapTypeInfo o = (MapTypeInfo) other; - return o.getMapKeyTypeInfo().equals(getMapKeyTypeInfo()) - && o.getMapValueTypeInfo().equals(getMapValueTypeInfo()); - } - - @Override - public int hashCode() { - return mapKeyTypeInfo.hashCode() ^ mapValueTypeInfo.hashCode(); - } - -} diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/PrimitiveTypeInfo.java b/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/PrimitiveTypeInfo.java index 97af49a60494991d117ccdf0ea2bfb170a827da3..bfeb2380313ef34b5da0aa59842e87d5a6c18bdf 100644 --- a/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/PrimitiveTypeInfo.java +++ b/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/PrimitiveTypeInfo.java @@ -22,6 +22,7 @@ import org.apache.hadoop.hive.common.classification.InterfaceAudience; import org.apache.hadoop.hive.common.classification.InterfaceStability; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils; @@ -36,16 +37,14 @@ */ @InterfaceAudience.Public @InterfaceStability.Stable -public class PrimitiveTypeInfo extends TypeInfo implements Serializable { +public class PrimitiveTypeInfo extends MetastorePrimitiveTypeInfo implements Serializable { private static final long serialVersionUID = 1L; - // Base name (varchar vs fully qualified name such as varchar(200)). - protected String typeName; - /** * For java serialization use only. */ public PrimitiveTypeInfo() { + super(); } /** @@ -75,44 +74,7 @@ public PrimitiveCategory getPrimitiveCategory() { return getPrimitiveTypeEntry().primitiveJavaClass; } - // The following 2 methods are for java serialization use only. - public void setTypeName(String typeName) { - this.typeName = typeName; - } - - @Override - public String getTypeName() { - return typeName; - } - public PrimitiveTypeEntry getPrimitiveTypeEntry() { return PrimitiveObjectInspectorUtils.getTypeEntryFromTypeName(typeName); } - - @Override - public boolean equals(Object other) { - if (this == other) { - return true; - } - if (other == null || getClass() != other.getClass()) { - return false; - } - - PrimitiveTypeInfo pti = (PrimitiveTypeInfo) other; - - return this.typeName.equals(pti.typeName); - } - - /** - * Generate the hashCode for this TypeInfo. 
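// Illustrative example, not part of this patch. Identity of a primitive type
// is its (possibly parameterized) type name, a behaviour these methods carry
// into the new MetastorePrimitiveTypeInfo base class:
//
//   PrimitiveTypeInfo a = TypeInfoFactory.getPrimitiveTypeInfo("varchar(20)");
//   PrimitiveTypeInfo b = TypeInfoFactory.getPrimitiveTypeInfo("varchar(20)");
//   a.equals(b);   // true; the factory also caches, so both name one instance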
- */ - @Override - public int hashCode() { - return typeName.hashCode(); - } - - @Override - public String toString() { - return typeName; - } } diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/StructTypeInfo.java b/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/StructTypeInfo.java deleted file mode 100644 index 4caedb02d8d765245d068482ba2cb293dad1526a..0000000000000000000000000000000000000000 --- a/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/StructTypeInfo.java +++ /dev/null @@ -1,151 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.serde2.typeinfo; - -import java.io.Serializable; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; - -import org.apache.hadoop.hive.common.classification.InterfaceAudience; -import org.apache.hadoop.hive.common.classification.InterfaceStability; -import org.apache.hadoop.hive.serde.serdeConstants; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; - -/** - * StructTypeInfo represents the TypeInfo of a struct. A struct contains one or - * more fields each of which has a unique name and its own TypeInfo. Different - * fields can have the same or different TypeInfo. - * - * Always use the TypeInfoFactory to create new TypeInfo objects, instead of - * directly creating an instance of this class. - */ -@InterfaceAudience.Public -@InterfaceStability.Stable -public final class StructTypeInfo extends TypeInfo implements Serializable { - - private static final long serialVersionUID = 1L; - - private ArrayList allStructFieldNames; - private ArrayList allStructFieldTypeInfos; - - /** - * For java serialization use only. - */ - public StructTypeInfo() { - } - - @Override - public String getTypeName() { - StringBuilder sb = new StringBuilder(); - sb.append(serdeConstants.STRUCT_TYPE_NAME + "<"); - for (int i = 0; i < allStructFieldNames.size(); i++) { - if (i > 0) { - sb.append(","); - } - sb.append(allStructFieldNames.get(i)); - sb.append(":"); - sb.append(allStructFieldTypeInfos.get(i).getTypeName()); - } - sb.append(">"); - return sb.toString(); - } - - /** - * For java serialization use only. - */ - public void setAllStructFieldNames(ArrayList allStructFieldNames) { - this.allStructFieldNames = allStructFieldNames; - } - - /** - * For java serialization use only. - */ - public void setAllStructFieldTypeInfos( - ArrayList allStructFieldTypeInfos) { - this.allStructFieldTypeInfos = allStructFieldTypeInfos; - } - - /** - * For TypeInfoFactory use only. 
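// Illustrative example, not part of this patch:
//
//   TypeInfo struct = TypeInfoFactory.getStructTypeInfo(
//       Arrays.asList("id", "name"),
//       Arrays.<TypeInfo>asList(TypeInfoFactory.getPrimitiveTypeInfo("int"),
//                               TypeInfoFactory.getPrimitiveTypeInfo("string")));
//   struct.getTypeName();   // -> "struct<id:int,name:string>"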
- */ - StructTypeInfo(List names, List typeInfos) { - allStructFieldNames = new ArrayList(names); - allStructFieldTypeInfos = new ArrayList(typeInfos); - } - - @Override - public Category getCategory() { - return Category.STRUCT; - } - - public ArrayList getAllStructFieldNames() { - return allStructFieldNames; - } - - public ArrayList getAllStructFieldTypeInfos() { - return allStructFieldTypeInfos; - } - - public TypeInfo getStructFieldTypeInfo(String field) { - String fieldLowerCase = field.toLowerCase(); - for (int i = 0; i < allStructFieldNames.size(); i++) { - if (fieldLowerCase.equalsIgnoreCase(allStructFieldNames.get(i))) { - return allStructFieldTypeInfos.get(i); - } - } - throw new RuntimeException("cannot find field " + field - + "(lowercase form: " + fieldLowerCase + ") in " + allStructFieldNames); - // return null; - } - - @Override - public boolean equals(Object other) { - if (this == other) { - return true; - } - if (!(other instanceof StructTypeInfo)) { - return false; - } - StructTypeInfo o = (StructTypeInfo) other; - Iterator namesIterator = getAllStructFieldNames().iterator(); - Iterator otherNamesIterator = o.getAllStructFieldNames().iterator(); - - // Compare the field names using ignore-case semantics - while (namesIterator.hasNext() && otherNamesIterator.hasNext()) { - if (!namesIterator.next().equalsIgnoreCase(otherNamesIterator.next())) { - return false; - } - } - - // Different number of field names - if (namesIterator.hasNext() || otherNamesIterator.hasNext()) { - return false; - } - - // Compare the field types - return o.getAllStructFieldTypeInfos().equals(getAllStructFieldTypeInfos()); - } - - @Override - public int hashCode() { - return allStructFieldNames.hashCode() ^ allStructFieldTypeInfos.hashCode(); - } - -} diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/TimestampLocalTZTypeInfo.java b/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/TimestampLocalTZTypeInfo.java index 6f9eeea02035ef9e06ee4c5f4d8a443eed36ce6c..8b942086db3cd7ae4b0d462eeccb803fd68a7eea 100644 --- a/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/TimestampLocalTZTypeInfo.java +++ b/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/TimestampLocalTZTypeInfo.java @@ -101,4 +101,9 @@ public void setTimeZone(ZoneId timeZone) { this.timeZone = timeZone; } + @Override + public Object[] getParameters() { + return new Object[] { timeZone }; + } + } diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/TypeInfo.java b/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/TypeInfo.java deleted file mode 100644 index 75e09739b9814b1428f94f71516645417981cbc7..0000000000000000000000000000000000000000 --- a/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/TypeInfo.java +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.serde2.typeinfo; - -import java.io.Serializable; - -import org.apache.hadoop.hive.common.classification.InterfaceAudience; -import org.apache.hadoop.hive.common.classification.InterfaceStability; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; - -/** - * Stores information about a type. Always use the TypeInfoFactory to create new - * TypeInfo objects. - * - * We support 8 categories of types: - * 1. Primitive objects (String, Number, etc) - * 2. List objects (a list of objects of a single type) - * 3. Map objects (a map from objects of one type to objects of another type) - * 4. Struct objects (a list of fields with names and their own types) - * 5. Union objects - * 6. Decimal objects - * 7. Char objects - * 8. Varchar objects - */ -@InterfaceAudience.Public -@InterfaceStability.Stable -public abstract class TypeInfo implements Serializable { - - private static final long serialVersionUID = 1L; - - protected TypeInfo() { - } - - /** - * The Category of this TypeInfo. Possible values are Primitive, List, Map, - * Struct and Union, which corresponds to the 5 sub-classes of TypeInfo. - */ - public abstract Category getCategory(); - - /** - * A String representation of the TypeInfo. - */ - public abstract String getTypeName(); - - /** - * String representing the qualified type name. - * Qualified types should override this method. - * @return - */ - public String getQualifiedName() { - return getTypeName(); - } - - @Override - public String toString() { - return getTypeName(); - } - - @Override - public abstract boolean equals(Object o); - - @Override - public abstract int hashCode(); - - public boolean accept(TypeInfo other) { - return this.equals(other); - } - -} diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/TypeInfoFactory.java b/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/TypeInfoFactory.java index 77d60c59c219d654d49fdd4853ce33543669c9bc..e730e9d9588b2d1c0aa4c99b9395c50f4f476d7c 100644 --- a/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/TypeInfoFactory.java +++ b/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/TypeInfoFactory.java @@ -29,6 +29,7 @@ import org.apache.hadoop.hive.serde.serdeConstants; import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils; import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils.PrimitiveTypeEntry; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoParser.PrimitiveParts; /** * TypeInfoFactory can be used to create the TypeInfo object for any types. @@ -39,6 +40,53 @@ */ public final class TypeInfoFactory { + //if this singleton looks weird, it is. It is done to support the move of TypeInfo and its sub-classes + //to standalone metastore. Standalone metastore needs access to TypeInfos without depending + //on hive-serde. In order to create these TypeInfo, it needs the TypeInfoParser and TypeInfoFactory. + //We cannot move TypeInfoFactory out of hive code base since its too disruptive. + //ITypeInfoFactory interface in standalone-metastore abstracts out the common functionality + //and metastore implements its own version of TypeInfoFactory. Hive uses its original TypeInfoFactory + //without any changes. 
+  //The singleton weirdness is needed to preserve the caching behaviour when this
+  //TypeInfoFactory is used from Deserializers and TypeInfoParsers.
+  private static final ITypeInfoFactory instance = new ITypeInfoFactory() {
+    @Override
+    public MetastorePrimitiveTypeInfo getPrimitiveTypeInfo(String typeName, Object... parameters) {
+      if (serdeConstants.DECIMAL_TYPE_NAME.equals(typeName)) {
+        HiveDecimalUtils.validateParameter((Integer) parameters[0], (Integer) parameters[1]);
+      } else if (serdeConstants.CHAR_TYPE_NAME.equals(typeName)) {
+        BaseCharUtils.validateCharParameter((Integer) parameters[0]);
+      } else if (serdeConstants.VARCHAR_TYPE_NAME.equals(typeName)) {
+        BaseCharUtils.validateVarcharParameter((Integer) parameters[0]);
+      }
+      return TypeInfoFactory.getPrimitiveTypeInfo(
+          MetastoreTypeInfoUtils.getQualifiedPrimitiveTypeName(typeName, parameters));
+    }
+
+    @Override
+    public MapTypeInfo getMapTypeInfo(TypeInfo keyTypeInfo, TypeInfo valueTypeInfo) {
+      return (MapTypeInfo) TypeInfoFactory.getMapTypeInfo(keyTypeInfo, valueTypeInfo);
+    }
+
+    @Override
+    public ListTypeInfo getListTypeInfo(TypeInfo listElementTypeInfo) {
+      return (ListTypeInfo) TypeInfoFactory.getListTypeInfo(listElementTypeInfo);
+    }
+
+    @Override
+    public UnionTypeInfo getUnionTypeInfo(List<TypeInfo> typeInfos) {
+      return (UnionTypeInfo) TypeInfoFactory.getUnionTypeInfo(typeInfos);
+    }
+
+    @Override
+    public StructTypeInfo getStructTypeInfo(List<String> names, List<TypeInfo> typeInfos) {
+      return (StructTypeInfo) TypeInfoFactory.getStructTypeInfo(names, typeInfos);
+    }
+  };
+
+  public static ITypeInfoFactory getInstance() {
+    return instance;
+  }
+
   private TypeInfoFactory() {
     // prevent instantiation
   }
@@ -134,14 +182,14 @@ public static PrimitiveTypeInfo getPrimitiveTypeInfo(String typeName) {
    * @return PrimitiveTypeInfo instance
    */
   private static PrimitiveTypeInfo createPrimitiveTypeInfo(String fullName) {
-    String baseName = TypeInfoUtils.getBaseName(fullName);
+    String baseName = MetastoreTypeInfoUtils.getBaseName(fullName);
     PrimitiveTypeEntry typeEntry =
         PrimitiveObjectInspectorUtils.getTypeEntryFromTypeName(baseName);
     if (null == typeEntry) {
       throw new RuntimeException("Unknown type " + fullName);
     }
-    TypeInfoUtils.PrimitiveParts parts = TypeInfoUtils.parsePrimitiveParts(fullName);
+    PrimitiveParts parts = TypeInfoUtils.parsePrimitiveParts(fullName);
     if (parts.typeParams == null || parts.typeParams.length < 1) {
       return null;
     }
diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/TypeInfoUtils.java b/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/TypeInfoUtils.java
index f3b19f079173ec2f347495346708dc0ca21f70a3..65ce1328bda77a4edf3f78d22c014c274feb8170 100644
--- a/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/TypeInfoUtils.java
+++ b/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/TypeInfoUtils.java
@@ -25,14 +25,11 @@
 import java.util.ArrayList;
 import java.util.EnumMap;
 import java.util.HashMap;
-import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
 import java.util.concurrent.ConcurrentHashMap;

-import org.apache.hadoop.hive.common.type.HiveDecimal;
 import org.apache.hadoop.hive.common.type.HiveVarchar;
-import org.apache.hadoop.hive.serde.serdeConstants;
 import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
@@ -47,7 +44,7 @@
 import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
 import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils;
 import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils.PrimitiveGrouping;
-import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils.PrimitiveTypeEntry;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoParser.PrimitiveParts;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -240,15 +237,6 @@ public static boolean hasParameters(String typeName) {
     }
   }

-  public static String getBaseName(String typeName) {
-    int idx = typeName.indexOf('(');
-    if (idx == -1) {
-      return typeName;
-    } else {
-      return typeName.substring(0, idx);
-    }
-  }
-
   /**
    * returns true if both TypeInfos are of primitive type, and the primitive category matches.
    * @param ti1
    * @param ti2
    * @return
    */
   public static boolean doPrimitiveCategoriesMatch(TypeInfo ti1, TypeInfo ti2) {
-    if (ti1.getCategory() == Category.PRIMITIVE && ti2.getCategory() == Category.PRIMITIVE) {
+    if (ti1.getCategory() == ObjectInspector.Category.PRIMITIVE && ti2.getCategory() == ObjectInspector.Category.PRIMITIVE) {
       if (((PrimitiveTypeInfo)ti1).getPrimitiveCategory() ==
           ((PrimitiveTypeInfo)ti2).getPrimitiveCategory()) {
         return true;
@@ -266,328 +254,10 @@ public static boolean doPrimitiveCategoriesMatch(TypeInfo ti1, TypeInfo ti2) {
   }

   /**
-   * Parse a recursive TypeInfo list String. For example, the following inputs
-   * are valid inputs:
-   * "int,string,map<string,string>,list<map<int,list<string>>>,list<struct<a:int,b:string>>"
-   * The separators between TypeInfos can be ",", ":", or ";".
-   *
-   * In order to use this class: TypeInfoParser parser = new
-   * TypeInfoParser("int,string"); ArrayList<TypeInfo> typeInfos =
-   * parser.parseTypeInfos();
-   */
-  private static class TypeInfoParser {
-
-    private static class Token {
-      public int position;
-      public String text;
-      public boolean isType;
-
-      @Override
-      public String toString() {
-        return "" + position + ":" + text;
-      }
-    };
-
-    private static boolean isTypeChar(char c) {
-      return Character.isLetterOrDigit(c) || c == '_' || c == '.' || c == ' ' || c == '$';
-    }
-
-    /**
-     * Tokenize the typeInfoString. The rule is simple: all consecutive
-     * alphadigits and '_', '.' are in one token, and all other characters are
-     * one character per token.
-     *
-     * tokenize("map<int,string>") should return
-     * ["map","<","int",",","string",">"]
-     *
-     * Note that we add '$' in new Calcite return path. As '$' will not appear
-     * in any type in Hive, it is safe to do so.
-     */
-    private static ArrayList<Token> tokenize(String typeInfoString) {
-      ArrayList<Token> tokens = new ArrayList<Token>(0);
-      int begin = 0;
-      int end = 1;
-      while (end <= typeInfoString.length()) {
-        // last character ends a token?
- // if there are quotes, all the text between the quotes - // is considered a single token (this can happen for - // timestamp with local time-zone) - if (begin > 0 && - typeInfoString.charAt(begin - 1) == '(' && - typeInfoString.charAt(begin) == '\'') { - // Ignore starting quote - begin++; - do { - end++; - } while (typeInfoString.charAt(end) != '\''); - } else if (typeInfoString.charAt(begin) == '\'' && - typeInfoString.charAt(begin + 1) == ')') { - // Ignore closing quote - begin++; - end++; - } - if (end == typeInfoString.length() - || !isTypeChar(typeInfoString.charAt(end - 1)) - || !isTypeChar(typeInfoString.charAt(end))) { - Token t = new Token(); - t.position = begin; - t.text = typeInfoString.substring(begin, end); - t.isType = isTypeChar(typeInfoString.charAt(begin)); - tokens.add(t); - begin = end; - } - end++; - } - return tokens; - } - - public TypeInfoParser(String typeInfoString) { - this.typeInfoString = typeInfoString; - typeInfoTokens = tokenize(typeInfoString); - } - - private final String typeInfoString; - private final ArrayList typeInfoTokens; - private ArrayList typeInfos; - private int iToken; - - public ArrayList parseTypeInfos() { - typeInfos = new ArrayList(); - iToken = 0; - while (iToken < typeInfoTokens.size()) { - typeInfos.add(parseType()); - if (iToken < typeInfoTokens.size()) { - Token separator = typeInfoTokens.get(iToken); - if (",".equals(separator.text) || ";".equals(separator.text) - || ":".equals(separator.text)) { - iToken++; - } else { - throw new IllegalArgumentException( - "Error: ',', ':', or ';' expected at position " - + separator.position + " from '" + typeInfoString + "' " - + typeInfoTokens); - } - } - } - return typeInfos; - } - - private Token peek() { - if (iToken < typeInfoTokens.size()) { - return typeInfoTokens.get(iToken); - } else { - return null; - } - } - - private Token expect(String item) { - return expect(item, null); - } - - private Token expect(String item, String alternative) { - if (iToken >= typeInfoTokens.size()) { - throw new IllegalArgumentException("Error: " + item - + " expected at the end of '" + typeInfoString + "'"); - } - Token t = typeInfoTokens.get(iToken); - if (item.equals("type")) { - if (!serdeConstants.LIST_TYPE_NAME.equals(t.text) - && !serdeConstants.MAP_TYPE_NAME.equals(t.text) - && !serdeConstants.STRUCT_TYPE_NAME.equals(t.text) - && !serdeConstants.UNION_TYPE_NAME.equals(t.text) - && null == PrimitiveObjectInspectorUtils - .getTypeEntryFromTypeName(t.text) - && !t.text.equals(alternative)) { - throw new IllegalArgumentException("Error: " + item - + " expected at the position " + t.position + " of '" - + typeInfoString + "' but '" + t.text + "' is found."); - } - } else if (item.equals("name")) { - if (!t.isType && !t.text.equals(alternative)) { - throw new IllegalArgumentException("Error: " + item - + " expected at the position " + t.position + " of '" - + typeInfoString + "' but '" + t.text + "' is found."); - } - } else { - if (!item.equals(t.text) && !t.text.equals(alternative)) { - throw new IllegalArgumentException("Error: " + item - + " expected at the position " + t.position + " of '" - + typeInfoString + "' but '" + t.text + "' is found."); - } - } - iToken++; - return t; - } - - private String[] parseParams() { - List params = new LinkedList(); - - Token t = peek(); - if (t != null && t.text.equals("(")) { - expect("("); - - // checking for null in the for-loop condition prevents null-ptr exception - // and allows us to fail more gracefully with a parsing error. 
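Before the move, this parser was only reachable through TypeInfoUtils, and those public entry points are kept by the patch. A small usage sketch of the type-string grammar described in the javadoc above (demo class hypothetical):

    import java.util.List;
    import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
    import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;

    public class ParserDemo {
      public static void main(String[] args) {
        // Separators between top-level types may be ',', ':' or ';'.
        List<TypeInfo> infos = TypeInfoUtils.getTypeInfosFromTypeString(
            "int,string,map<string,decimal(10,2)>");
        for (TypeInfo ti : infos) {
          System.out.println(ti.getTypeName()); // int / string / map<string,decimal(10,2)>
        }
      }
    }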
- for(t = peek(); (t == null) || !t.text.equals(")"); t = expect(",",")")) { - params.add(expect("name").text); - } - if (params.size() == 0) { - throw new IllegalArgumentException( - "type parameters expected for type string " + typeInfoString); - } - } - - return params.toArray(new String[params.size()]); - } - - private TypeInfo parseType() { - - Token t = expect("type"); - - // Is this a primitive type? - PrimitiveTypeEntry typeEntry = - PrimitiveObjectInspectorUtils.getTypeEntryFromTypeName(t.text); - if (typeEntry != null && typeEntry.primitiveCategory != PrimitiveCategory.UNKNOWN ) { - String[] params = parseParams(); - switch (typeEntry.primitiveCategory) { - case CHAR: - case VARCHAR: - if (params == null || params.length == 0) { - throw new IllegalArgumentException(typeEntry.typeName - + " type is specified without length: " + typeInfoString); - } - - int length = 1; - if (params.length == 1) { - length = Integer.parseInt(params[0]); - if (typeEntry.primitiveCategory == PrimitiveCategory.VARCHAR) { - BaseCharUtils.validateVarcharParameter(length); - return TypeInfoFactory.getVarcharTypeInfo(length); - } else { - BaseCharUtils.validateCharParameter(length); - return TypeInfoFactory.getCharTypeInfo(length); - } - } else if (params.length > 1) { - throw new IllegalArgumentException( - "Type " + typeEntry.typeName+ " only takes one parameter, but " + - params.length + " is seen"); - } - - case DECIMAL: - int precision = HiveDecimal.USER_DEFAULT_PRECISION; - int scale = HiveDecimal.USER_DEFAULT_SCALE; - if (params == null || params.length == 0) { - // It's possible that old metadata still refers to "decimal" as a column type w/o - // precision/scale. In this case, the default (10,0) is assumed. Thus, do nothing here. - } else if (params.length == 1) { - // only precision is specified - precision = Integer.valueOf(params[0]); - HiveDecimalUtils.validateParameter(precision, scale); - } else if (params.length == 2) { - // New metadata always have two parameters. - precision = Integer.parseInt(params[0]); - scale = Integer.parseInt(params[1]); - HiveDecimalUtils.validateParameter(precision, scale); - } else if (params.length > 2) { - throw new IllegalArgumentException("Type decimal only takes two parameter, but " + - params.length + " is seen"); - } - return TypeInfoFactory.getDecimalTypeInfo(precision, scale); - - default: - return TypeInfoFactory.getPrimitiveTypeInfo(typeEntry.typeName); - } - } - - // Is this a list type? - if (serdeConstants.LIST_TYPE_NAME.equals(t.text)) { - expect("<"); - TypeInfo listElementType = parseType(); - expect(">"); - return TypeInfoFactory.getListTypeInfo(listElementType); - } - - // Is this a map type? - if (serdeConstants.MAP_TYPE_NAME.equals(t.text)) { - expect("<"); - TypeInfo mapKeyType = parseType(); - expect(","); - TypeInfo mapValueType = parseType(); - expect(">"); - return TypeInfoFactory.getMapTypeInfo(mapKeyType, mapValueType); - } - - // Is this a struct type? 
- if (serdeConstants.STRUCT_TYPE_NAME.equals(t.text)) { - ArrayList fieldNames = new ArrayList(); - ArrayList fieldTypeInfos = new ArrayList(); - boolean first = true; - do { - if (first) { - expect("<"); - first = false; - } else { - Token separator = expect(">", ","); - if (separator.text.equals(">")) { - // end of struct - break; - } - } - Token name = expect("name",">"); - if (name.text.equals(">")) { - break; - } - fieldNames.add(name.text); - expect(":"); - fieldTypeInfos.add(parseType()); - } while (true); - - return TypeInfoFactory.getStructTypeInfo(fieldNames, fieldTypeInfos); - } - // Is this a union type? - if (serdeConstants.UNION_TYPE_NAME.equals(t.text)) { - List objectTypeInfos = new ArrayList(); - boolean first = true; - do { - if (first) { - expect("<"); - first = false; - } else { - Token separator = expect(">", ","); - if (separator.text.equals(">")) { - // end of union - break; - } - } - objectTypeInfos.add(parseType()); - } while (true); - - return TypeInfoFactory.getUnionTypeInfo(objectTypeInfos); - } - - throw new RuntimeException("Internal error parsing position " - + t.position + " of '" + typeInfoString + "'"); - } - - public PrimitiveParts parsePrimitiveParts() { - PrimitiveParts parts = new PrimitiveParts(); - Token t = expect("type"); - parts.typeName = t.text; - parts.typeParams = parseParams(); - return parts; - } - } - - public static class PrimitiveParts { - public String typeName; - public String[] typeParams; - } - - /** * Make some of the TypeInfo parsing available as a utility. */ public static PrimitiveParts parsePrimitiveParts(String typeInfoString) { - TypeInfoParser parser = new TypeInfoParser(typeInfoString); + TypeInfoParser parser = new TypeInfoParser(typeInfoString, TypeInfoFactory.getInstance()); return parser.parsePrimitiveParts(); } @@ -844,7 +514,7 @@ public static TypeInfo getTypeInfoFromObjectInspector(ObjectInspector oi) { } public static ArrayList getTypeInfosFromTypeString(String typeString) { - TypeInfoParser parser = new TypeInfoParser(typeString); + TypeInfoParser parser = new TypeInfoParser(typeString, TypeInfoFactory.getInstance()); return parser.parseTypeInfos(); } @@ -861,7 +531,7 @@ public static TypeInfo getTypeInfoFromObjectInspector(ObjectInspector oi) { } public static TypeInfo getTypeInfoFromTypeString(String typeString) { - TypeInfoParser parser = new TypeInfoParser(typeString); + TypeInfoParser parser = new TypeInfoParser(typeString, TypeInfoFactory.getInstance()); return parser.parseTypeInfos().get(0); } diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/UnionTypeInfo.java b/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/UnionTypeInfo.java deleted file mode 100644 index 842997c021cb16e7695294bf3154028fa7f53068..0000000000000000000000000000000000000000 --- a/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/UnionTypeInfo.java +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
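The struct and union branches above recurse through parseType(). For illustration, the accepted surface syntax for a union (serdeConstants.UNION_TYPE_NAME is "uniontype"); demo class hypothetical:

    import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
    import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;

    public class UnionParseDemo {
      public static void main(String[] args) {
        TypeInfo u = TypeInfoUtils.getTypeInfoFromTypeString("uniontype<int,string>");
        // Prints the canonical name rebuilt by UnionTypeInfo.getTypeName():
        System.out.println(u.getTypeName()); // uniontype<int,string>
      }
    }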
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.hadoop.hive.serde2.typeinfo; - -import java.io.Serializable; -import java.util.ArrayList; -import java.util.List; - -import org.apache.hadoop.hive.common.classification.InterfaceAudience; -import org.apache.hadoop.hive.common.classification.InterfaceStability; -import org.apache.hadoop.hive.serde.serdeConstants; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; - -/** - * UnionTypeInfo represents the TypeInfo of an union. A union holds only one - * field of the specified fields at any point of time. The fields, a Union can - * hold, can have the same or different TypeInfo. - * - * Always use the TypeInfoFactory to create new TypeInfo objects, instead of - * directly creating an instance of this class. - */ -@InterfaceAudience.Public -@InterfaceStability.Stable -public class UnionTypeInfo extends TypeInfo implements Serializable { - - private static final long serialVersionUID = 1L; - - private List allUnionObjectTypeInfos; - - /** - * For java serialization use only. - */ - public UnionTypeInfo() { - } - - @Override - public String getTypeName() { - StringBuilder sb = new StringBuilder(); - sb.append(serdeConstants.UNION_TYPE_NAME + "<"); - for (int i = 0; i < allUnionObjectTypeInfos.size(); i++) { - if (i > 0) { - sb.append(","); - } - sb.append(allUnionObjectTypeInfos.get(i).getTypeName()); - } - sb.append(">"); - return sb.toString(); - } - - /** - * For java serialization use only. - */ - public void setAllUnionObjectTypeInfos( - List allUnionObjectTypeInfos) { - this.allUnionObjectTypeInfos = allUnionObjectTypeInfos; - } - - /** - * For TypeInfoFactory use only. - */ - UnionTypeInfo(List typeInfos) { - allUnionObjectTypeInfos = new ArrayList(); - allUnionObjectTypeInfos.addAll(typeInfos); - } - - @Override - public Category getCategory() { - return Category.UNION; - } - - public List getAllUnionObjectTypeInfos() { - return allUnionObjectTypeInfos; - } - - @Override - public boolean equals(Object other) { - if (this == other) { - return true; - } - if (!(other instanceof UnionTypeInfo)) { - return false; - } - UnionTypeInfo o = (UnionTypeInfo) other; - - // Compare the field types - return o.getAllUnionObjectTypeInfos().equals(getAllUnionObjectTypeInfos()); - } - - @Override - public int hashCode() { - return allUnionObjectTypeInfos.hashCode(); - } -} diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/VarcharTypeInfo.java b/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/VarcharTypeInfo.java index edf12a20005896e4e12d0ea6171760a76ee3fe53..a6c248a755becba9469c3ccfd2b3ba841d856144 100644 --- a/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/VarcharTypeInfo.java +++ b/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/VarcharTypeInfo.java @@ -51,7 +51,6 @@ public boolean equals(Object other) { return this.getLength() == pti.getLength(); } - /** * Generate the hashCode for this TypeInfo. 
*/ diff --git a/serde/src/test/org/apache/hadoop/hive/serde2/avro/TestInstanceCache.java b/serde/src/test/org/apache/hadoop/hive/serde2/avro/TestInstanceCache.java deleted file mode 100644 index cb7c6ed42158727ab5dd32ecb1f5f7d727b87ba3..0000000000000000000000000000000000000000 --- a/serde/src/test/org/apache/hadoop/hive/serde2/avro/TestInstanceCache.java +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.hadoop.hive.serde2.avro; - -import java.util.Set; -import org.junit.Test; - -import static org.junit.Assert.assertSame; - -public class TestInstanceCache { - private static class Foo { - - private int value = 42; - - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - - Foo foo = (Foo) o; - - return value == foo.value; - - } - - @Override - public int hashCode() { - return value; - } - } - - private static class Wrapper { - public final T wrapped; - - private Wrapper(T wrapped) { - this.wrapped = wrapped; - } - } - - @Test - public void instanceCachesOnlyCreateOneInstance() throws AvroSerdeException { - InstanceCache> ic = new InstanceCache>() { - @Override - protected Wrapper makeInstance(Foo hv, - Set seenSchemas) { - return new Wrapper(hv); - } - }; - Foo f1 = new Foo(); - - Wrapper fc = ic.retrieve(f1, null); - assertSame(f1, fc.wrapped); // Our original foo should be in the wrapper - - Foo f2 = new Foo(); // Different instance, same value - - Wrapper fc2 = ic.retrieve(f2, null); - assertSame(fc2,fc); // Since equiv f, should get back first container - assertSame(fc2.wrapped, f1); - } - - @Test - public void instanceCacheReturnsCorrectInstances() throws AvroSerdeException { - InstanceCache> ic = new InstanceCache>() { - @Override - protected Wrapper makeInstance( - String hv, Set seenSchemas) { - return new Wrapper(hv); - } - }; - - Wrapper one = ic.retrieve("one", null); - Wrapper two = ic.retrieve("two", null); - - Wrapper anotherOne = ic.retrieve("one", null); - assertSame(one, anotherOne); - - Wrapper anotherTwo = ic.retrieve("two", null); - assertSame(two, anotherTwo); - } -} diff --git a/serde/src/test/org/apache/hadoop/hive/serde2/avro/TestSchemaToTypeInfo.java b/serde/src/test/org/apache/hadoop/hive/serde2/avro/TestSchemaToTypeInfo.java index af258c7d174e78458cf8caaa0a58ae903d44c3a8..7e928350d48a1105c6ce7f8b8fa1bf6b4b98c6d5 100644 --- a/serde/src/test/org/apache/hadoop/hive/serde2/avro/TestSchemaToTypeInfo.java +++ b/serde/src/test/org/apache/hadoop/hive/serde2/avro/TestSchemaToTypeInfo.java @@ -33,7 +33,7 @@ @Test public void testDisallowRecursiveSchema() - throws AvroSerdeException { + throws Exception { expect.expect(AvroSerdeException.class); expect.expectMessage("Recursive schemas are 
not supported"); @@ -49,6 +49,7 @@ public void testDisallowRecursiveSchema() + " } ]\n" + "}"; - List types = SchemaToTypeInfo.generateColumnTypes(new Schema.Parser().parse(schemaString)); + List types = SchemaToHiveTypeInfo.getInstance() + .generateColumnTypes(new Schema.Parser().parse(schemaString)); } } \ No newline at end of file diff --git a/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/AvroStorageSchemaReader.java b/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/AvroStorageSchemaReader.java new file mode 100644 index 0000000000000000000000000000000000000000..c6c30294787f69c7c0a38aa563811c4c5d2ac1f2 --- /dev/null +++ b/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/AvroStorageSchemaReader.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.metastore; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.metastore.api.EnvironmentContext; +import org.apache.hadoop.hive.metastore.api.FieldSchema; +import org.apache.hadoop.hive.metastore.api.MetaException; +import org.apache.hadoop.hive.metastore.api.Table; +import org.apache.hadoop.hive.metastore.utils.AvroSchemaUtils; +import org.apache.hadoop.hive.metastore.utils.MetaStoreUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.List; +import java.util.Properties; + +public class AvroStorageSchemaReader implements StorageSchemaReader { + private static final Logger LOG = LoggerFactory.getLogger(AvroStorageSchemaReader.class); + + @Override + public List readSchema(Table tbl, EnvironmentContext envContext, + Configuration conf) throws MetaException { + Properties tblMetadataProperties = MetaStoreUtils.getTableMetadata(tbl); + try { + return AvroSchemaUtils.getFieldsFromAvroSchema(conf, tblMetadataProperties); + } catch (Exception e) { + LOG.warn("Received IOException while reading avro schema for table " + tbl.getTableName(), e); + throw new MetaException(e.getMessage()); + } + } +} diff --git a/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/ColumnType.java b/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/ColumnType.java index d5dea4dc3ca6a83a863326e1c75e2480898d00af..ab8590e1edc1ed14baf6bce086450bc1971f9831 100644 --- a/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/ColumnType.java +++ b/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/ColumnType.java @@ -72,6 +72,8 @@ public static final String TIMESTAMPTZ_TYPE_NAME = "timestamp with time zone"; + public static final String TIMESTAMPLOCALTZ_TYPE_NAME = "timestamp with local time zone"; + public static final String LIST_TYPE_NAME = "array"; public static final 
String MAP_TYPE_NAME = "map"; @@ -105,7 +107,8 @@ INTERVAL_DAY_TIME_TYPE_NAME, DECIMAL_TYPE_NAME, BINARY_TYPE_NAME, - TIMESTAMPTZ_TYPE_NAME); + TIMESTAMPTZ_TYPE_NAME, + TIMESTAMPLOCALTZ_TYPE_NAME); public static final Set StringTypes = StringUtils.asSet( STRING_TYPE_NAME, diff --git a/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/DefaultStorageSchemaReader.java b/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/DefaultStorageSchemaReader.java index 1dbfa4272cd5368242d335fbde564d35cac26bba..65c2af6fc7b3bd219193823feef195bd2b27121c 100644 --- a/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/DefaultStorageSchemaReader.java +++ b/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/DefaultStorageSchemaReader.java @@ -22,17 +22,121 @@ import org.apache.hadoop.hive.metastore.api.FieldSchema; import org.apache.hadoop.hive.metastore.api.MetaException; import org.apache.hadoop.hive.metastore.api.Table; +import org.apache.hadoop.hive.metastore.conf.MetastoreConf; +import org.apache.hadoop.hive.metastore.utils.AvroSchemaUtils; +import org.apache.hadoop.hive.metastore.utils.MetaStoreUtils; +import org.apache.hadoop.hive.metastore.utils.StorageSchemaUtils; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; import java.util.List; +import java.util.Properties; + +import static org.apache.hadoop.hive.metastore.utils.AvroSchemaUtils.LIST_COLUMN_COMMENTS; /** * Default StorageSchemaReader. This just throws as the metastore currently doesn't know how to * read schemas from storage. */ public class DefaultStorageSchemaReader implements StorageSchemaReader { + private final static Logger LOG = LoggerFactory.getLogger(DefaultStorageSchemaReader.class); + + private static final String AVRO_SERIALIZATION_LIB = + "org.apache.hadoop.hive.serde2.avro.AvroSerDe"; + @Override public List readSchema(Table tbl, EnvironmentContext envContext, - Configuration conf) throws MetaException { - throw new UnsupportedOperationException("Storage schema reading not supported"); + Configuration conf) throws MetaException { + String serializationLib = tbl.getSd().getSerdeInfo().getSerializationLib(); + if (null == serializationLib || MetastoreConf + .getStringCollection(conf, MetastoreConf.ConfVars.SERDES_USING_METASTORE_FOR_SCHEMA) + .contains(serializationLib)) { + //safety check to make sure we should be using storage schema reader for this table + throw new MetaException( + "Invalid usage of default storage schema reader for table " + tbl.getTableName() + + " with storage descriptor " + tbl.getSd().getSerdeInfo().getSerializationLib()); + } + Properties tblMetadataProperties = MetaStoreUtils.getTableMetadata(tbl); + if(AVRO_SERIALIZATION_LIB.equals(serializationLib)) { + //in case of avro table use AvroStorageSchemaReader utils + try { + return AvroSchemaUtils.getFieldsFromAvroSchema(conf, tblMetadataProperties); + } catch (Exception e) { + LOG.warn("Exception received while reading avro schema for table " + tbl.getTableName(), e); + throw new MetaException(e.getMessage()); + } + } else { + return getFieldSchemasFromTableMetadata(tblMetadataProperties); + } + } + + /** + * This method implements a generic way to get the FieldSchemas from the table metadata + * properties like column names and column types. 
   * Most of the serdes have the same implementation in their initialize method.
+   * //TODO refactor the common code from the serdes and move it to serde-api so that there is no
+   * //duplicate code
+   *
+   * @return list of FieldSchema objects
+   */
+  public static List<FieldSchema> getFieldSchemasFromTableMetadata(
+      Properties tblMetadataProperties) {
+    List<String> columnNames = null;
+    List<TypeInfo> columnTypes = null;
+    // Get column names and types
+    String columnNameProperty = tblMetadataProperties.getProperty(ColumnType.LIST_COLUMNS);
+    String columnTypeProperty = tblMetadataProperties.getProperty(ColumnType.LIST_COLUMN_TYPES);
+    final String columnNameDelimiter = tblMetadataProperties
+        .containsKey(ColumnType.COLUMN_NAME_DELIMITER) ? tblMetadataProperties
+        .getProperty(ColumnType.COLUMN_NAME_DELIMITER) : String
+        .valueOf(StorageSchemaUtils.COMMA);
+    // all table column names
+    if (columnNameProperty.isEmpty()) {
+      columnNames = Collections.emptyList();
+    } else {
+      columnNames = Arrays.asList(columnNameProperty.split(columnNameDelimiter));
+    }
+
+    // all column types
+    if (columnTypeProperty.isEmpty()) {
+      columnTypes = Collections.emptyList();
+    } else {
+      columnTypes = StorageSchemaUtils.getTypeInfosFromTypeString(columnTypeProperty);
+    }
+
+    final String columnCommentProperty =
+        tblMetadataProperties.getProperty(LIST_COLUMN_COMMENTS, "");
+    List<String> columnComments = null;
+    if (columnCommentProperty == null || columnCommentProperty.isEmpty()) {
+      columnComments = new ArrayList<>(0);
+    } else {
+      columnComments = Arrays.asList(
+          columnCommentProperty.split(String.valueOf(ColumnType.COLUMN_COMMENTS_DELIMITER)));
+    }
+    LOG.debug("columns: {}, {}", columnNameProperty, columnNames);
+    LOG.debug("types: {}, {} ", columnTypeProperty, columnTypes);
+    LOG.debug("comments: {} ", columnCommentProperty);
+    return getFieldSchemaFromColumnInfo(columnNames, columnTypes, columnComments);
+  }
+
+  private static List<FieldSchema> getFieldSchemaFromColumnInfo(List<String> columnNames,
+      List<TypeInfo> columnTypes, List<String> columnComments) {
+    int len = columnNames.size();
+    List<FieldSchema> fieldSchemas = new ArrayList<>(len);
+    for (int i = 0; i < len; i++) {
+      FieldSchema fieldSchema = new FieldSchema();
+      fieldSchema.setName(columnNames.get(i));
+      //In case of complex types getTypeName() will recursively go into the typeName
+      //of individual fields when the ColumnType was constructed
+      //in SchemaToTypeInfo.generateColumnTypes in the constructor
+      fieldSchema.setType(columnTypes.get(i).getTypeName());
+      fieldSchema.setComment(StorageSchemaUtils.determineFieldComment(columnComments.get(i)));
+      fieldSchemas.add(fieldSchema);
+    }
+    return fieldSchemas;
+  }
 }
diff --git a/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/StorageSchemaReader.java b/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/StorageSchemaReader.java
index 6251e23991b8f898f939f5848fc5cbf5e8ceb07c..009c9295dfc23596f4361f37facdb5c9dc9ee75c 100644
--- a/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/StorageSchemaReader.java
+++ b/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/StorageSchemaReader.java
@@ -32,7 +32,7 @@
  */
 @InterfaceAudience.Public
 @InterfaceStability.Evolving
-interface StorageSchemaReader {
+public interface StorageSchemaReader {
   /**
    * Read the schema from the storage representation of the table.
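Making StorageSchemaReader public lets a deployment swap in the Avro reader added by this patch. A hedged wiring sketch: the configuration key below is an assumption based on MetastoreConf naming conventions, not something this diff shows, so treat it as a placeholder:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.hive.metastore.conf.MetastoreConf;

    public class ReaderWiringDemo {
      public static void main(String[] args) {
        Configuration conf = MetastoreConf.newMetastoreConf();
        // Assumed key name; check MetastoreConf.ConfVars for the real constant.
        conf.set("metastore.storage.schema.reader.impl",
            "org.apache.hadoop.hive.metastore.AvroStorageSchemaReader");
      }
    }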
* @param tbl metastore table object diff --git a/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/utils/AvroFieldSchemaGenerator.java b/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/utils/AvroFieldSchemaGenerator.java new file mode 100644 index 0000000000000000000000000000000000000000..b1261d8ee9a6746d66432198104cc0608dfde94c --- /dev/null +++ b/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/utils/AvroFieldSchemaGenerator.java @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.metastore.utils; + +import org.apache.avro.Schema; +import org.apache.hadoop.hive.metastore.api.FieldSchema; +import org.apache.hadoop.hive.serde2.avro.SchemaToMetastoreTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; + +import java.util.ArrayList; +import java.util.List; + +public class AvroFieldSchemaGenerator { + final private List columnNames; + final private List columnTypes; + final private List columnComments; + + public AvroFieldSchemaGenerator(Schema schema) throws Exception { + verifySchemaIsARecord(schema); + + this.columnNames = generateColumnNames(schema); + this.columnTypes = SchemaToMetastoreTypeInfo.getInstance().generateColumnTypes(schema); + this.columnComments = generateColumnComments(schema); + assert columnNames.size() == columnTypes.size(); + } + + private static void verifySchemaIsARecord(Schema schema) throws Exception { + if(!schema.getType().equals(Schema.Type.RECORD)) { + throw new Exception("Schema for table must be of type RECORD. " + + "Received type: " + schema.getType()); + } + } + + private static List generateColumnNames(Schema schema) { + List fields = schema.getFields(); + List fieldsList = new ArrayList(fields.size()); + + for (Schema.Field field : fields) { + fieldsList.add(field.name()); + } + + return fieldsList; + } + + private static List generateColumnComments(Schema schema) { + List fields = schema.getFields(); + List fieldComments = new ArrayList(fields.size()); + + for (Schema.Field field : fields) { + String fieldComment = field.doc() == null ? 
"" : field.doc(); + fieldComments.add(fieldComment); + } + + return fieldComments; + } + + public List getFieldSchemas() throws Exception { + int len = columnNames.size(); + List fieldSchemas = new ArrayList<>(len); + for(int i = 0; i getFieldsFromAvroSchema(Configuration configuration, + Properties properties) throws Exception { + // Reset member variables so we don't get in a half-constructed state + Schema schema = null; + List columnNames = null; + List columnTypes = null; + + final String columnNameProperty = properties.getProperty(ColumnType.LIST_COLUMNS); + final String columnTypeProperty = properties.getProperty(ColumnType.LIST_COLUMN_TYPES); + final String columnCommentProperty = properties.getProperty(LIST_COLUMN_COMMENTS,""); + final String columnNameDelimiter = properties.containsKey(ColumnType.COLUMN_NAME_DELIMITER) ? properties + .getProperty(ColumnType.COLUMN_NAME_DELIMITER) : String.valueOf(COMMA); + + if (hasExternalSchema(properties) + || columnNameProperty == null || columnNameProperty.isEmpty() + || columnTypeProperty == null || columnTypeProperty.isEmpty()) { + schema = AvroSchemaUtils.determineSchemaOrThrowException(configuration, properties); + } else { + // Get column names and sort order + columnNames = StringUtils.intern( + Arrays.asList(columnNameProperty.split(columnNameDelimiter))); + columnTypes = new TypeInfoParser(columnTypeProperty, MetastoreTypeInfoFactory + .getInstance()).parseTypeInfos(); + + schema = getSchemaFromCols(properties, columnNames, columnTypes, columnCommentProperty); + properties.setProperty(AvroTableProperties.SCHEMA_LITERAL.getPropName(), schema.toString()); + } + + if (LOG.isDebugEnabled()) { + LOG.debug("Avro schema is " + schema); + } + + if (configuration == null) { + LOG.debug("Configuration null, not inserting schema"); + } else { + configuration.set( + AvroTableProperties.AVRO_SERDE_SCHEMA.getPropName(), schema.toString(false)); + } + return new AvroFieldSchemaGenerator(schema).getFieldSchemas(); + } + + + private static boolean hasExternalSchema(Properties properties) { + return properties.getProperty(AvroTableProperties.SCHEMA_LITERAL.getPropName()) != null + || properties.getProperty(AvroTableProperties.SCHEMA_URL.getPropName()) != null; + } + + public static boolean supportedCategories(TypeInfo ti) { + final Category c = ti.getCategory(); + return c.equals(Category.PRIMITIVE) || + c.equals(Category.MAP) || + c.equals(Category.LIST) || + c.equals(Category.STRUCT) || + c.equals(Category.UNION); + } + + /** + * Attempt to determine the schema via the usual means, but do not throw + * an exception if we fail. Instead, signal failure via a special + * schema. + */ + public static Schema determineSchemaOrReturnErrorSchema(Configuration conf, Properties props) { + try { + return AvroSchemaUtils.determineSchemaOrThrowException(conf, props); + } catch (Exception e) { + LOG.warn("Encountered exception determining schema. Returning signal " + + "schema to indicate problem", e); + } + return SchemaResolutionProblem.SIGNAL_BAD_SCHEMA; + } + + /** + * Determine the schema to that's been provided for Avro serde work. 
+ * @param properties containing a key pointing to the schema, one way or another + * @return schema to use while serdeing the avro file + */ + public static Schema determineSchemaOrThrowException(Configuration conf, Properties properties) + throws Exception { + String schemaString = properties.getProperty(AvroTableProperties.SCHEMA_LITERAL.getPropName()); + if(schemaString != null && !schemaString.equals(SCHEMA_NONE)) + return AvroSchemaUtils.getSchemaFor(schemaString); + + // Try pulling directly from URL + schemaString = properties.getProperty(AvroTableProperties.SCHEMA_URL.getPropName()); + if (schemaString == null) { + final String columnNameProperty = properties.getProperty(ColumnType.LIST_COLUMNS); + final String columnTypeProperty = properties.getProperty(ColumnType.LIST_COLUMN_TYPES); + final String columnCommentProperty = properties.getProperty(LIST_COLUMN_COMMENTS); + if (columnNameProperty == null || columnNameProperty.isEmpty() + || columnTypeProperty == null || columnTypeProperty.isEmpty() ) { + throw new IOException(EXCEPTION_MESSAGE); + } + final String columnNameDelimiter = properties.containsKey(ColumnType.COLUMN_NAME_DELIMITER) ? properties + .getProperty(ColumnType.COLUMN_NAME_DELIMITER) : String.valueOf(COMMA); + // Get column names and types + List columnNames = Arrays.asList(columnNameProperty.split(columnNameDelimiter)); + List columnTypes = + new TypeInfoParser(columnTypeProperty, + MetastoreTypeInfoFactory.getInstance()).parseTypeInfos(); + //TODO Why can't we directly bypass this whole logic and use ColumnTypeInfo to use + //AvroFieldSchemaGenerator directly? + Schema schema = getSchemaFromCols(properties, columnNames, columnTypes, columnCommentProperty); + properties.setProperty(AvroTableProperties.SCHEMA_LITERAL.getPropName(), schema.toString()); + if (conf != null) + conf.set(AvroTableProperties.AVRO_SERDE_SCHEMA.getPropName(), schema.toString(false)); + return schema; + } else if(schemaString.equals(SCHEMA_NONE)) { + throw new Exception(EXCEPTION_MESSAGE); + } + + try { + Schema s = getSchemaFromFS(schemaString, conf); + if (s == null) { + //in case schema is not a file system + return AvroSchemaUtils.getSchemaFor(new URL(schemaString)); + } + return s; + } catch (IOException ioe) { + throw new Exception("Unable to read schema from given path: " + schemaString, ioe); + } catch (URISyntaxException urie) { + throw new Exception("Unable to read schema from given path: " + schemaString, urie); + } + } + + // Protected for testing and so we can pass in a conf for testing. 
+ protected static Schema getSchemaFromFS(String schemaFSUrl, + Configuration conf) throws IOException, URISyntaxException { + FSDataInputStream in = null; + FileSystem fs = null; + try { + fs = FileSystem.get(new URI(schemaFSUrl), conf); + } catch (IOException ioe) { + //return null only if the file system in schema is not recognized + if (LOG.isDebugEnabled()) { + String msg = "Failed to open file system for uri " + schemaFSUrl + " assuming it is not a FileSystem url"; + LOG.debug(msg, ioe); + } + + return null; + } + try { + in = fs.open(new Path(schemaFSUrl)); + Schema s = AvroSchemaUtils.getSchemaFor(in); + return s; + } finally { + if(in != null) in.close(); + } + } + + public static Schema getSchemaFor(File file) { + Schema.Parser parser = new Schema.Parser(); + Schema schema; + try { + schema = parser.parse(file); + } catch (IOException e) { + throw new RuntimeException("Failed to parse Avro schema from " + file.getName(), e); + } + return schema; + } + + public static Schema getSchemaFor(InputStream stream) { + Schema.Parser parser = new Schema.Parser(); + Schema schema; + try { + schema = parser.parse(stream); + } catch (IOException e) { + throw new RuntimeException("Failed to parse Avro schema", e); + } + return schema; + } + + public static Schema getSchemaFor(String str) { + Schema.Parser parser = new Schema.Parser(); + Schema schema = parser.parse(str); + return schema; + } + + public static Schema getSchemaFor(URL url) { + InputStream in = null; + try { + in = url.openStream(); + return getSchemaFor(in); + } catch (Exception e) { + throw new RuntimeException("Failed to parse Avro schema", e); + } finally { + if (in != null) { + try { + in.close(); + } catch (IOException e) { + // Ignore + } + } + } + } + + public static Schema getSchemaFromCols(Properties properties, + List columnNames, List columnTypes, String columnCommentProperty) { + List columnComments; + if (columnCommentProperty == null || columnCommentProperty.isEmpty()) { + columnComments = new ArrayList(); + } else { + //Comments are separated by "\0" in columnCommentProperty, see method getSchema + //in MetaStoreUtils where this string columns.comments is generated + columnComments = Arrays.asList(columnCommentProperty.split("\0")); + + if (LOG.isDebugEnabled()) { + LOG.debug("columnComments is " + columnCommentProperty); + } + } + if (columnNames.size() != columnTypes.size()) { + throw new IllegalArgumentException("getSchemaFromCols initialization failed. Number of column " + + "name and column type differs. columnNames = " + columnNames + ", columnTypes = " + + columnTypes); + } + + final String tableName = properties.getProperty(AvroSerDeConstants.TABLE_NAME); + final String tableComment = properties.getProperty(AvroSerDeConstants.TABLE_COMMENT); + TypeInfoToSchema metastoreTypeInfoToSchema = new TypeInfoToSchema(); + return metastoreTypeInfoToSchema.convert(columnNames, columnTypes, columnComments, + properties.getProperty(AvroTableProperties.SCHEMA_NAMESPACE.getPropName()), + properties.getProperty(AvroTableProperties.SCHEMA_NAME.getPropName(), tableName), + properties.getProperty(AvroTableProperties.SCHEMA_DOC.getPropName(), tableComment)); + + } + + /** + * Determine if an Avro schema is of type Union[T, NULL]. Avro supports nullable + * types via a union of type T and null. This is a very common use case. + * As such, we want to silently convert it to just T and allow the value to be null. + * + * When a Hive union type is used with AVRO, the schema type becomes + * Union[NULL, T1, T2, ...]. 
The NULL in the union should be silently removed + * + * @return true if type represents Union[T, Null], false otherwise + */ + public static boolean isNullableType(Schema schema) { + if (!schema.getType().equals(Schema.Type.UNION)) { + return false; + } + + List itemSchemas = schema.getTypes(); + if (itemSchemas.size() < 2) { + return false; + } + + for (Schema itemSchema : itemSchemas) { + if (Schema.Type.NULL.equals(itemSchema.getType())) { + return true; + } + } + + // [null, null] not allowed, so this check is ok. + return false; + } + + /** + * In a nullable type, get the schema for the non-nullable type. This method + * does no checking that the provides Schema is nullable. + */ + public static Schema getOtherTypeFromNullableType(Schema schema) { + List itemSchemas = new ArrayList<>(); + for (Schema itemSchema : schema.getTypes()) { + if (!Schema.Type.NULL.equals(itemSchema.getType())) { + itemSchemas.add(itemSchema); + } + } + + if (itemSchemas.size() > 1) { + return Schema.createUnion(itemSchemas); + } else { + return itemSchemas.get(0); + } + } +} diff --git a/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/utils/StorageSchemaUtils.java b/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/utils/StorageSchemaUtils.java new file mode 100644 index 0000000000000000000000000000000000000000..5ec642f9b43e121de24c19970e06a94e735ad744 --- /dev/null +++ b/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/utils/StorageSchemaUtils.java @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.metastore.utils; + +import org.apache.hadoop.hive.serde2.typeinfo.MetastoreTypeInfoFactory; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoParser; + +import java.util.List; + +public class StorageSchemaUtils { + public static final char COMMA = ','; + public static List getTypeInfosFromTypeString(String columnTypeProperty) { + return new TypeInfoParser(columnTypeProperty, MetastoreTypeInfoFactory.getInstance()) + .parseTypeInfos(); + } + + private static final String FROM_STORAGE_SCHEMA_READER = "generated by storage schema reader"; + public static String determineFieldComment(String comment) { + return (comment == null) ? 
FROM_STORAGE_SCHEMA_READER : comment; + } +} \ No newline at end of file diff --git a/standalone-metastore/src/main/java/org/apache/hadoop/hive/serde2/avro/AvroSerDeConstants.java b/standalone-metastore/src/main/java/org/apache/hadoop/hive/serde2/avro/AvroSerDeConstants.java new file mode 100644 index 0000000000000000000000000000000000000000..42868ea0f6cb6c6259e9b170d0788582337a68ab --- /dev/null +++ b/standalone-metastore/src/main/java/org/apache/hadoop/hive/serde2/avro/AvroSerDeConstants.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.serde2.avro; + +/** + * This class contains some of the constants which are specific to AvroSerDe + * They should always match with the constants defined in AvroSerDe.java in Hive Source code. These + * constants were copied as part of separating metastore from Hive. + */ +public class AvroSerDeConstants { + public static final String TABLE_NAME = "name"; + public static final String TABLE_COMMENT = "comment"; + public static final String LIST_COLUMN_COMMENTS = "columns.comments"; + + //it just so happens that the AVRO has these constants which are same as defined in ColumnType + //We should still keep it separate in case in future we need to separate the two + public static final String DECIMAL_TYPE_NAME = "decimal"; + public static final String CHAR_TYPE_NAME = "char"; + public static final String VARCHAR_TYPE_NAME = "varchar"; + public static final String DATE_TYPE_NAME = "date"; + + public static final String AVRO_TIMESTAMP_TYPE_NAME = "timestamp-millis"; + public static final String AVRO_PROP_LOGICAL_TYPE = "logicalType"; + public static final String AVRO_PROP_PRECISION = "precision"; + public static final String AVRO_PROP_SCALE = "scale"; + public static final String AVRO_PROP_MAX_LENGTH = "maxLength"; + public static final String AVRO_STRING_TYPE_NAME = "string"; + public static final String AVRO_INT_TYPE_NAME = "int"; + public static final String AVRO_LONG_TYPE_NAME = "long"; +} diff --git a/standalone-metastore/src/main/java/org/apache/hadoop/hive/serde2/avro/InstanceCache.java b/standalone-metastore/src/main/java/org/apache/hadoop/hive/serde2/avro/InstanceCache.java new file mode 100644 index 0000000000000000000000000000000000000000..12a8ff2266fbee118b11d0c3f18eab777211d23c --- /dev/null +++ b/standalone-metastore/src/main/java/org/apache/hadoop/hive/serde2/avro/InstanceCache.java @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
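As a quick illustration of the [null, T] union convention handled by isNullableType and getOtherTypeFromNullableType earlier in AvroSchemaUtils (demo class hypothetical):

    import org.apache.avro.Schema;
    import org.apache.hadoop.hive.metastore.utils.AvroSchemaUtils;

    public class NullableUnionDemo {
      public static void main(String[] args) {
        Schema nullableString = new Schema.Parser().parse("[\"null\",\"string\"]");
        System.out.println(AvroSchemaUtils.isNullableType(nullableString)); // true
        // Stripping the null branch leaves the single non-null type:
        System.out.println(AvroSchemaUtils.getOtherTypeFromNullableType(nullableString)); // "string"
      }
    }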
The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.serde2.avro;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Cache for objects whose creation only depends on some other set of objects and therefore can be
+ * used against other equivalent versions of those objects. Essentially memoizes instance creation.
+ *
+ * @param <SeedObject> Object that determines the instance. The cache uses this object as a key for
+ *                     its hash which is why it is imperative to have appropriate equals and hashcode
+ *                     implementation for this object for the cache to work properly
+ * @param <Instance> Instance that will be created from SeedObject.
+ */
+public abstract class InstanceCache<SeedObject, Instance> {
+  private static final Logger LOG = LoggerFactory.getLogger(InstanceCache.class);
+  Map<SeedObject, Instance> cache = new HashMap<SeedObject, Instance>();
+
+  public InstanceCache() {}
+
+  /**
+   * Retrieve (or create if it doesn't exist) the correct Instance for this
+   * SeedObject
+   */
+  public Instance retrieve(SeedObject hv) throws Exception {
+    return retrieve(hv, null);
+  }
+
+  /**
+   * Retrieve (or create if it doesn't exist) the correct Instance for this
+   * SeedObject using 'seenSchemas' to resolve circular references
+   */
+  public synchronized Instance retrieve(SeedObject hv,
+      Set<SeedObject> seenSchemas) throws Exception {
+    if(LOG.isDebugEnabled()) LOG.debug("Checking for hv: " + hv.toString());
+
+    if(cache.containsKey(hv)) {
+      if(LOG.isDebugEnabled()) LOG.debug("Returning cache result.");
+      return cache.get(hv);
+    }
+
+    if(LOG.isDebugEnabled()) LOG.debug("Creating new instance and storing in cache");
+
+    Instance instance = makeInstance(hv, seenSchemas);
+    cache.put(hv, instance);
+    return instance;
+  }
+
+  protected abstract Instance makeInstance(SeedObject hv,
+      Set<SeedObject> seenSchemas) throws Exception;
+}
diff --git a/standalone-metastore/src/main/java/org/apache/hadoop/hive/serde2/avro/SchemaResolutionProblem.java b/standalone-metastore/src/main/java/org/apache/hadoop/hive/serde2/avro/SchemaResolutionProblem.java
new file mode 100644
index 0000000000000000000000000000000000000000..3470514c3a059a2dd4faa51173f781d778ab02b0
--- /dev/null
+++ b/standalone-metastore/src/main/java/org/apache/hadoop/hive/serde2/avro/SchemaResolutionProblem.java
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
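A hedged sketch of InstanceCache in use, mirroring how SchemaToTypeInfo memoizes conversions later in this patch; the key and value types here are illustrative only:

    import java.util.Set;
    import org.apache.avro.Schema;
    import org.apache.hadoop.hive.serde2.avro.InstanceCache;

    public class CacheDemo {
      // The cache only requires that the seed object implement equals/hashCode
      // consistently; Schema -> String is an arbitrary demo mapping.
      static final InstanceCache<Schema, String> NAMES = new InstanceCache<Schema, String>() {
        @Override
        protected String makeInstance(Schema s, Set<Schema> seenSchemas) {
          return s.getFullName(); // computed once per distinct (equal) schema
        }
      };

      public static void main(String[] args) throws Exception {
        Schema s = Schema.create(Schema.Type.STRING);
        System.out.println(NAMES.retrieve(s)); // "string"
      }
    }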
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.serde2.avro; + +import org.apache.avro.Schema; +import org.apache.hadoop.classification.InterfaceAudience.LimitedPrivate; +import org.apache.hadoop.hive.metastore.utils.AvroSchemaUtils; + +@LimitedPrivate("Hive") +public class SchemaResolutionProblem { + static final String sentinelString = "{\n" + + " \"namespace\": \"org.apache.hadoop.hive\",\n" + + " \"name\": \"CannotDetermineSchemaSentinel\",\n" + + " \"type\": \"record\",\n" + + " \"fields\": [\n" + + " {\n" + + " \"name\":\"ERROR_ERROR_ERROR_ERROR_ERROR_ERROR_ERROR\",\n" + + " \"type\":\"string\"\n" + + " },\n" + + " {\n" + + " \"name\":\"Cannot_determine_schema\",\n" + + " \"type\":\"string\"\n" + + " },\n" + + " {\n" + + " \"name\":\"check\",\n" + + " \"type\":\"string\"\n" + + " },\n" + + " {\n" + + " \"name\":\"schema\",\n" + + " \"type\":\"string\"\n" + + " },\n" + + " {\n" + + " \"name\":\"url\",\n" + + " \"type\":\"string\"\n" + + " },\n" + + " {\n" + + " \"name\":\"and\",\n" + + " \"type\":\"string\"\n" + + " },\n" + + " {\n" + + " \"name\":\"literal\",\n" + + " \"type\":\"string\"\n" + + " }\n" + + " ]\n" + + "}"; + public final static Schema SIGNAL_BAD_SCHEMA = AvroSchemaUtils.getSchemaFor(sentinelString); +} diff --git a/standalone-metastore/src/main/java/org/apache/hadoop/hive/serde2/avro/SchemaToMetastoreTypeInfo.java b/standalone-metastore/src/main/java/org/apache/hadoop/hive/serde2/avro/SchemaToMetastoreTypeInfo.java new file mode 100644 index 0000000000000000000000000000000000000000..b40a2411a86b7c76bb01c9b0f81007ec3af55a89 --- /dev/null +++ b/standalone-metastore/src/main/java/org/apache/hadoop/hive/serde2/avro/SchemaToMetastoreTypeInfo.java @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package org.apache.hadoop.hive.serde2.avro;
+
+import org.apache.hadoop.hive.serde2.typeinfo.MetastoreTypeInfoFactory;
+
+public class SchemaToMetastoreTypeInfo extends SchemaToTypeInfo {
+  private static final SchemaToMetastoreTypeInfo instance = new SchemaToMetastoreTypeInfo();
+
+  private SchemaToMetastoreTypeInfo() {
+    super(MetastoreTypeInfoFactory.getInstance());
+  }
+
+  public static SchemaToMetastoreTypeInfo getInstance() {
+    return instance;
+  }
+}
\ No newline at end of file
diff --git a/standalone-metastore/src/main/java/org/apache/hadoop/hive/serde2/avro/SchemaToTypeInfo.java b/standalone-metastore/src/main/java/org/apache/hadoop/hive/serde2/avro/SchemaToTypeInfo.java
new file mode 100644
index 0000000000000000000000000000000000000000..c49e890a9deca9342693ac88d0b7df11e4ff54b6
--- /dev/null
+++ b/standalone-metastore/src/main/java/org/apache/hadoop/hive/serde2/avro/SchemaToTypeInfo.java
@@ -0,0 +1,294 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.serde2.avro;
+
+import static org.apache.avro.Schema.Type.BOOLEAN;
+import static org.apache.avro.Schema.Type.BYTES;
+import static org.apache.avro.Schema.Type.DOUBLE;
+import static org.apache.avro.Schema.Type.FIXED;
+import static org.apache.avro.Schema.Type.FLOAT;
+import static org.apache.avro.Schema.Type.INT;
+import static org.apache.avro.Schema.Type.LONG;
+import static org.apache.avro.Schema.Type.NULL;
+import static org.apache.avro.Schema.Type.STRING;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Hashtable;
+import java.util.IdentityHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.avro.Schema;
+import org.apache.hadoop.classification.InterfaceAudience.LimitedPrivate;
+import org.apache.hadoop.hive.metastore.ColumnType;
+import org.apache.hadoop.hive.metastore.utils.AvroSchemaUtils;
+import org.apache.hadoop.hive.serde2.typeinfo.ITypeInfoFactory;
+import org.apache.hadoop.hive.serde2.typeinfo.MetastoreTypeInfoUtils;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+
+/**
+ * Convert an Avro Schema to a Hive TypeInfo.
+ */
+@LimitedPrivate("Hive")
+public abstract class SchemaToTypeInfo {
+  // Conversion of Avro primitive types to Hive primitive types:
+  //   Avro       Hive
+  //   null       void
+  //   boolean    boolean
+  //   int        int
+  //   long       bigint
+  //   float      float
+  //   double     double
+  //   bytes      binary
+  //   fixed      binary
+  //   string     string
+  //              tinyint  (no Avro counterpart)
+  //              smallint (no Avro counterpart)
+
+  // Map of Avro's primitive types to Hive's (for those that are supported by both)
+  private final Map<Schema.Type, TypeInfo> primitiveTypeToTypeInfo;
+  private final ITypeInfoFactory typeInfoFactory;
+
+  SchemaToTypeInfo(ITypeInfoFactory typeInfoFactory) {
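+    // The injected factory keeps this converter independent of any concrete
+    // TypeInfo implementation: the metastore wires in MetastoreTypeInfoFactory
+    // (via the SchemaToMetastoreTypeInfo singleton above), while Hive's serde
+    // layer can supply its own factory.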
+    this.typeInfoFactory = typeInfoFactory;
+    primitiveTypeToTypeInfo = initTypeMap();
+  }
+
+  private Map<Schema.Type, TypeInfo> initTypeMap() {
+    Map<Schema.Type, TypeInfo> theMap = new Hashtable<>();
+    theMap.put(NULL, typeInfoFactory.getPrimitiveTypeInfo("void"));
+    theMap.put(BOOLEAN, typeInfoFactory.getPrimitiveTypeInfo("boolean"));
+    theMap.put(INT, typeInfoFactory.getPrimitiveTypeInfo("int"));
+    theMap.put(LONG, typeInfoFactory.getPrimitiveTypeInfo("bigint"));
+    theMap.put(FLOAT, typeInfoFactory.getPrimitiveTypeInfo("float"));
+    theMap.put(DOUBLE, typeInfoFactory.getPrimitiveTypeInfo("double"));
+    theMap.put(BYTES, typeInfoFactory.getPrimitiveTypeInfo("binary"));
+    theMap.put(FIXED, typeInfoFactory.getPrimitiveTypeInfo("binary"));
+    theMap.put(STRING, typeInfoFactory.getPrimitiveTypeInfo("string"));
+    return Collections.unmodifiableMap(theMap);
+  }
+
+  /**
+   * Generate a list of TypeInfos from an Avro schema. This method is
+   * currently public due to some weirdness in deserializing unions, but
+   * will be made private once that is resolved.
+   * @param schema Schema to generate field types for
+   * @return List of TypeInfos, each element of which is a TypeInfo derived
+   *         from the schema.
+   * @throws Exception for problems during conversion.
+   */
+  public List<TypeInfo> generateColumnTypes(Schema schema) throws Exception {
+    return generateColumnTypes(schema, null);
+  }
+
+  /**
+   * Generate a list of TypeInfos from an Avro schema. This method is
+   * currently public due to some weirdness in deserializing unions, but
+   * will be made private once that is resolved.
+   * @param schema Schema to generate field types for
+   * @param seenSchemas stores schemas processed in the parsing done so far,
+   *         helping to resolve circular references in the schema
+   * @return List of TypeInfos, each element of which is a TypeInfo derived
+   *         from the schema.
+   * @throws Exception for problems during conversion.
+   */
+  public List<TypeInfo> generateColumnTypes(Schema schema,
+      Set<Schema> seenSchemas) throws Exception {
+    List<Schema.Field> fields = schema.getFields();
+
+    List<TypeInfo> types = new ArrayList<>(fields.size());
+
+    for (Schema.Field field : fields) {
+      types.add(generateTypeInfo(field.schema(), seenSchemas));
+    }
+
+    return types;
+  }
+
+  private InstanceCache<Schema, TypeInfo> typeInfoCache =
+      new InstanceCache<Schema, TypeInfo>() {
+        @Override
+        protected TypeInfo makeInstance(Schema s,
+            Set<Schema> seenSchemas)
+            throws Exception {
+          return generateTypeInfoWorker(s, seenSchemas);
+        }
+      };
+
+  /**
+   * Convert an Avro Schema into an equivalent Hive TypeInfo.
+   * @param schema the schema to convert. Must be of record type.
+   * @param seenSchemas stores schemas processed in the parsing done so far,
+   *         helping to resolve circular references in the schema
+   * @return TypeInfo matching the Avro schema
+   * @throws Exception for any problems during conversion.
+   */
+  public TypeInfo generateTypeInfo(Schema schema,
+      Set<Schema> seenSchemas) throws Exception {
+    // For the bytes type, it can be mapped to decimal.
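+    // A decimal column arrives as an Avro "bytes" type carrying a "decimal"
+    // logical-type annotation, e.g. (illustrative, not from this patch):
+    //   {"type":"bytes","logicalType":"decimal","precision":10,"scale":2}
+    // The precision and scale properties are read back out below and checked
+    // leniently by MetastoreTypeInfoUtils before the decimal TypeInfo is built.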
+ Schema.Type type = schema.getType(); + if (type == BYTES && AvroSerDeConstants.DECIMAL_TYPE_NAME + .equalsIgnoreCase(schema.getProp(AvroSerDeConstants.AVRO_PROP_LOGICAL_TYPE))) { + int precision = 0; + int scale = 0; + try { + precision = schema.getJsonProp(AvroSerDeConstants.AVRO_PROP_PRECISION).getIntValue(); + scale = schema.getJsonProp(AvroSerDeConstants.AVRO_PROP_SCALE).getIntValue(); + } catch (Exception ex) { + throw new Exception("Failed to obtain scale value from file schema: " + schema, ex); + } + + try { + MetastoreTypeInfoUtils.validateDecimalParameters(precision, scale); + } catch (Exception ex) { + throw new Exception("Invalid precision or scale for decimal type", ex); + } + + return typeInfoFactory.getPrimitiveTypeInfo(ColumnType.DECIMAL_TYPE_NAME, precision, scale); + } + + if (type == STRING && AvroSerDeConstants.CHAR_TYPE_NAME + .equalsIgnoreCase(schema.getProp(AvroSerDeConstants.AVRO_PROP_LOGICAL_TYPE))) { + int maxLength = 0; + try { + maxLength = schema.getJsonProp(AvroSerDeConstants.AVRO_PROP_MAX_LENGTH).getValueAsInt(); + } catch (Exception ex) { + throw new Exception("Failed to obtain maxLength value from file schema: " + schema, ex); + } + return typeInfoFactory.getPrimitiveTypeInfo(ColumnType.CHAR_TYPE_NAME, maxLength); + } + + if (type == STRING && AvroSerDeConstants.VARCHAR_TYPE_NAME + .equalsIgnoreCase(schema.getProp(AvroSerDeConstants.AVRO_PROP_LOGICAL_TYPE))) { + int maxLength = 0; + try { + maxLength = schema.getJsonProp(AvroSerDeConstants.AVRO_PROP_MAX_LENGTH).getValueAsInt(); + } catch (Exception ex) { + throw new Exception("Failed to obtain maxLength value from file schema: " + schema, ex); + } + return typeInfoFactory.getPrimitiveTypeInfo(ColumnType.VARCHAR_TYPE_NAME, maxLength); + } + + if (type == INT && + AvroSerDeConstants.DATE_TYPE_NAME.equals(schema.getProp(AvroSerDeConstants.AVRO_PROP_LOGICAL_TYPE))) { + return typeInfoFactory.getPrimitiveTypeInfo(ColumnType.DATE_TYPE_NAME); + } + + if (type == LONG && + AvroSerDeConstants.AVRO_TIMESTAMP_TYPE_NAME.equals(schema.getProp(AvroSerDeConstants.AVRO_PROP_LOGICAL_TYPE))) { + return typeInfoFactory.getPrimitiveTypeInfo(ColumnType.TIMESTAMP_TYPE_NAME); + } + + return typeInfoCache.retrieve(schema, seenSchemas); + } + + private TypeInfo generateTypeInfoWorker(Schema schema, + Set seenSchemas) throws Exception { + // Avro requires NULLable types to be defined as unions of some type TypeInfo + // and NULL. This is annoying and we're going to hide it from the user. + if(AvroSchemaUtils.isNullableType(schema)) { + return generateTypeInfo( + AvroSchemaUtils.getOtherTypeFromNullableType(schema), seenSchemas); + } + + Schema.Type type = schema.getType(); + if(primitiveTypeToTypeInfo.containsKey(type)) { + return primitiveTypeToTypeInfo.get(type); + } + + switch(type) { + case RECORD: return generateRecordTypeInfo(schema, seenSchemas); + case MAP: return generateMapTypeInfo(schema, seenSchemas); + case ARRAY: return generateArrayTypeInfo(schema, seenSchemas); + case UNION: return generateUnionTypeInfo(schema, seenSchemas); + case ENUM: return generateEnumTypeInfo(schema); + default: throw new Exception("Do not yet support: " + schema); + } + } + + private TypeInfo generateRecordTypeInfo(Schema schema, + Set seenSchemas) throws Exception { + assert schema.getType().equals(Schema.Type.RECORD); + + if (seenSchemas == null) { + seenSchemas = Collections.newSetFromMap(new IdentityHashMap()); + } else if (seenSchemas.contains(schema)) { + throw new Exception( + "Recursive schemas are not supported. 
Recursive schema was " + schema + .getFullName()); + } + seenSchemas.add(schema); + + List fields = schema.getFields(); + List fieldNames = new ArrayList(fields.size()); + List typeInfos = new ArrayList(fields.size()); + + for(int i = 0; i < fields.size(); i++) { + fieldNames.add(i, fields.get(i).name()); + typeInfos.add(i, generateTypeInfo(fields.get(i).schema(), seenSchemas)); + } + + return typeInfoFactory.getStructTypeInfo(fieldNames, typeInfos); + } + + /** + * Generate a TypeInfo for an Avro Map. This is made slightly simpler in that + * Avro only allows maps with strings for keys. + */ + private TypeInfo generateMapTypeInfo(Schema schema, + Set seenSchemas) throws Exception { + assert schema.getType().equals(Schema.Type.MAP); + Schema valueType = schema.getValueType(); + TypeInfo ti = generateTypeInfo(valueType, seenSchemas); + + return typeInfoFactory + .getMapTypeInfo(typeInfoFactory.getPrimitiveTypeInfo(ColumnType.STRING_TYPE_NAME), ti); + } + + private TypeInfo generateArrayTypeInfo(Schema schema, + Set seenSchemas) throws Exception { + assert schema.getType().equals(Schema.Type.ARRAY); + Schema itemsType = schema.getElementType(); + TypeInfo itemsTypeInfo = generateTypeInfo(itemsType, seenSchemas); + + return typeInfoFactory.getListTypeInfo(itemsTypeInfo); + } + + private TypeInfo generateUnionTypeInfo(Schema schema, + Set seenSchemas) throws Exception { + assert schema.getType().equals(Schema.Type.UNION); + List types = schema.getTypes(); + + + List typeInfos = new ArrayList(types.size()); + + for(Schema type : types) { + typeInfos.add(generateTypeInfo(type, seenSchemas)); + } + + return typeInfoFactory.getUnionTypeInfo(typeInfos); + } + + // Hive doesn't have an Enum type, so we're going to treat them as Strings. + // During the deserialize/serialize stage we'll check for enumness and + // convert as such. + private TypeInfo generateEnumTypeInfo(Schema schema) { + assert schema.getType().equals(Schema.Type.ENUM); + + return typeInfoFactory.getPrimitiveTypeInfo("string"); + } +} diff --git a/standalone-metastore/src/main/java/org/apache/hadoop/hive/serde2/avro/TypeInfoToSchema.java b/standalone-metastore/src/main/java/org/apache/hadoop/hive/serde2/avro/TypeInfoToSchema.java new file mode 100644 index 0000000000000000000000000000000000000000..3328caf7c304f98f4c9bffcd6556ff6c85a4c93f --- /dev/null +++ b/standalone-metastore/src/main/java/org/apache/hadoop/hive/serde2/avro/TypeInfoToSchema.java @@ -0,0 +1,278 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hive.serde2.avro; + +import org.apache.avro.Schema; +import org.apache.hadoop.hive.metastore.ColumnType; +import org.apache.hadoop.hive.metastore.utils.AvroSchemaUtils; +import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.MetastorePrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.MetastoreTypeInfoUtils; +import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.UnionTypeInfo; +import org.codehaus.jackson.JsonNode; +import org.codehaus.jackson.node.JsonNodeFactory; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +/** + * Convert Hive TypeInfo to an Avro Schema + */ +public class TypeInfoToSchema { + + private long recordCounter = 0; + + /** + * Converts Hive schema to avro schema + * + * @param columnNames Names of the hive columns + * @param columnTypes Hive Column types + * @param namespace Namespace of Avro schema + * @param name Avro schema name + * @param doc Avro schema doc + * @return Avro Schema + */ + public Schema convert(List columnNames, List columnTypes, + List columnComments, String namespace, String name, String doc) { + + List fields = new ArrayList(); + for (int i = 0; i < columnNames.size(); ++i) { + final String comment = columnComments.size() > i ? columnComments.get(i) : null; + final Schema.Field avroField = createAvroField(columnNames.get(i), columnTypes.get(i), + comment); + fields.addAll(getFields(avroField)); + } + + if (name == null || name.isEmpty()) { + name = "baseRecord"; + } + + Schema avroSchema = Schema.createRecord(name, doc, namespace, false); + avroSchema.setFields(fields); + return avroSchema; + } + + private Schema.Field createAvroField(String name, TypeInfo typeInfo, String comment) { + return new Schema.Field(name, createAvroSchema(typeInfo), comment, null); + } + + private Schema createAvroSchema(TypeInfo typeInfo) { + Schema schema = null; + switch (typeInfo.getCategory()) { + case PRIMITIVE: + schema = createAvroPrimitive(typeInfo); + break; + case LIST: + schema = createAvroArray(typeInfo); + break; + case MAP: + schema = createAvroMap(typeInfo); + break; + case STRUCT: + schema = createAvroRecord(typeInfo); + break; + case UNION: + schema = createAvroUnion(typeInfo); + break; + } + + return wrapInUnionWithNull(schema); + } + + private Schema createAvroPrimitive(TypeInfo typeInfo) { + Schema schema; + String baseTypeName = MetastoreTypeInfoUtils.getBaseName(typeInfo.getTypeName()); + switch (baseTypeName) { + case ColumnType.STRING_TYPE_NAME: + schema = Schema.create(Schema.Type.STRING); + break; + case ColumnType.CHAR_TYPE_NAME: + schema = AvroSchemaUtils.getSchemaFor("{" + + "\"type\":\"" + AvroSerDeConstants.AVRO_STRING_TYPE_NAME + "\"," + + "\"logicalType\":\"" + AvroSerDeConstants.CHAR_TYPE_NAME + "\"," + + "\"maxLength\":" + ((MetastorePrimitiveTypeInfo) typeInfo).getParameters()[0] + "}"); + break; + case ColumnType.VARCHAR_TYPE_NAME: + schema = AvroSchemaUtils.getSchemaFor("{" + + "\"type\":\"" + AvroSerDeConstants.AVRO_STRING_TYPE_NAME + "\"," + + "\"logicalType\":\"" + AvroSerDeConstants.VARCHAR_TYPE_NAME + "\"," + + "\"maxLength\":" + ((MetastorePrimitiveTypeInfo) typeInfo).getParameters()[0] + "}"); + break; + case ColumnType.BINARY_TYPE_NAME: + schema = Schema.create(Schema.Type.BYTES); + break; + case ColumnType.TINYINT_TYPE_NAME: + 
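+        // Note: Avro has no 1- or 2-byte integer types, so tinyint and
+        // smallint below are both widened to Avro's 32-bit int; readers are
+        // expected to narrow the value again based on the Hive column type.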
schema = Schema.create(Schema.Type.INT); + break; + case ColumnType.SMALLINT_TYPE_NAME: + schema = Schema.create(Schema.Type.INT); + break; + case ColumnType.INT_TYPE_NAME: + schema = Schema.create(Schema.Type.INT); + break; + case ColumnType.BIGINT_TYPE_NAME: + schema = Schema.create(Schema.Type.LONG); + break; + case ColumnType.FLOAT_TYPE_NAME: + schema = Schema.create(Schema.Type.FLOAT); + break; + case ColumnType.DOUBLE_TYPE_NAME: + schema = Schema.create(Schema.Type.DOUBLE); + break; + case ColumnType.BOOLEAN_TYPE_NAME: + schema = Schema.create(Schema.Type.BOOLEAN); + break; + case ColumnType.DECIMAL_TYPE_NAME: + String precision = String.valueOf(((MetastorePrimitiveTypeInfo) typeInfo).getParameters()[0]); + String scale = String.valueOf(((MetastorePrimitiveTypeInfo) typeInfo).getParameters()[1]); + schema = AvroSchemaUtils.getSchemaFor("{" + + "\"type\":\"bytes\"," + + "\"logicalType\":\"decimal\"," + + "\"precision\":" + precision + "," + + "\"scale\":" + scale + "}"); + break; + case ColumnType.DATE_TYPE_NAME: + schema = AvroSchemaUtils.getSchemaFor("{" + + "\"type\":\"" + AvroSerDeConstants.AVRO_INT_TYPE_NAME + "\"," + + "\"logicalType\":\"" + AvroSerDeConstants.DATE_TYPE_NAME + "\"}"); + break; + case ColumnType.TIMESTAMP_TYPE_NAME: + schema = AvroSchemaUtils.getSchemaFor("{" + + "\"type\":\"" + AvroSerDeConstants.AVRO_LONG_TYPE_NAME + "\"," + + "\"logicalType\":\"" + AvroSerDeConstants.AVRO_TIMESTAMP_TYPE_NAME + "\"}"); + break; + case ColumnType.VOID_TYPE_NAME: + schema = Schema.create(Schema.Type.NULL); + break; + default: + throw new UnsupportedOperationException(typeInfo + " is not supported."); + } + return schema; + } + + private Schema createAvroUnion(TypeInfo typeInfo) { + List childSchemas = new ArrayList(); + for (TypeInfo childTypeInfo : ((UnionTypeInfo) typeInfo).getAllUnionObjectTypeInfos()) { + final Schema childSchema = createAvroSchema(childTypeInfo); + if (childSchema.getType() == Schema.Type.UNION) { + childSchemas.addAll(childSchema.getTypes()); + } else { + childSchemas.add(childSchema); + } + } + + return Schema.createUnion(removeDuplicateNullSchemas(childSchemas)); + } + + private Schema createAvroRecord(TypeInfo typeInfo) { + List childFields = new ArrayList(); + final List allStructFieldNames = ((StructTypeInfo) typeInfo).getAllStructFieldNames(); + final List allStructFieldTypeInfos = + ((StructTypeInfo) typeInfo).getAllStructFieldTypeInfos(); + if (allStructFieldNames.size() != allStructFieldTypeInfos.size()) { + throw new IllegalArgumentException("Failed to generate avro schema from hive schema. " + + "name and column type differs. 
names = " + allStructFieldNames + ", types = " + + allStructFieldTypeInfos); + } + + for (int i = 0; i < allStructFieldNames.size(); ++i) { + final TypeInfo childTypeInfo = allStructFieldTypeInfos.get(i); + final Schema.Field grandChildSchemaField = createAvroField(allStructFieldNames.get(i), + childTypeInfo, childTypeInfo.toString()); + final List grandChildFields = getFields(grandChildSchemaField); + childFields.addAll(grandChildFields); + } + + Schema recordSchema = Schema.createRecord("record_" + recordCounter, typeInfo.toString(), + null, false); + ++recordCounter; + recordSchema.setFields(childFields); + return recordSchema; + } + + private Schema createAvroMap(TypeInfo typeInfo) { + TypeInfo keyTypeInfo = ((MapTypeInfo) typeInfo).getMapKeyTypeInfo(); + if (!ColumnType.STRING_TYPE_NAME.equals(keyTypeInfo.getTypeName())) { + throw new UnsupportedOperationException("Key of Map can only be a String"); + } + + TypeInfo valueTypeInfo = ((MapTypeInfo) typeInfo).getMapValueTypeInfo(); + Schema valueSchema = createAvroSchema(valueTypeInfo); + + return Schema.createMap(valueSchema); + } + + private Schema createAvroArray(TypeInfo typeInfo) { + ListTypeInfo listTypeInfo = (ListTypeInfo) typeInfo; + Schema listSchema = createAvroSchema(listTypeInfo.getListElementTypeInfo()); + return Schema.createArray(listSchema); + } + + private List getFields(Schema.Field schemaField) { + List fields = new ArrayList(); + + JsonNode nullDefault = JsonNodeFactory.instance.nullNode(); + if (schemaField.schema().getType() == Schema.Type.RECORD) { + for (Schema.Field field : schemaField.schema().getFields()) { + fields.add(new Schema.Field(field.name(), field.schema(), field.doc(), nullDefault)); + } + } else { + fields.add(new Schema.Field(schemaField.name(), schemaField.schema(), schemaField.doc(), + nullDefault)); + } + + return fields; + } + + private Schema wrapInUnionWithNull(Schema schema) { + Schema wrappedSchema = schema; + switch (schema.getType()) { + case NULL: + break; + case UNION: + List existingSchemas = removeDuplicateNullSchemas(schema.getTypes()); + wrappedSchema = Schema.createUnion(existingSchemas); + break; + default: + wrappedSchema = Schema.createUnion(Arrays.asList(Schema.create(Schema.Type.NULL), schema)); + } + + return wrappedSchema; + } + + private List removeDuplicateNullSchemas(List childSchemas) { + List prunedSchemas = new ArrayList(); + boolean isNullPresent = false; + for (Schema schema : childSchemas) { + if (schema.getType() == Schema.Type.NULL) { + isNullPresent = true; + } else { + prunedSchemas.add(schema); + } + } + if (isNullPresent) { + prunedSchemas.add(0, Schema.create(Schema.Type.NULL)); + } + + return prunedSchemas; + } +} \ No newline at end of file diff --git a/standalone-metastore/src/main/java/org/apache/hadoop/hive/serde2/typeinfo/ITypeInfoFactory.java b/standalone-metastore/src/main/java/org/apache/hadoop/hive/serde2/typeinfo/ITypeInfoFactory.java new file mode 100644 index 0000000000000000000000000000000000000000..24edf70f3098cb38652101f54c7e09243a9dd6e8 --- /dev/null +++ b/standalone-metastore/src/main/java/org/apache/hadoop/hive/serde2/typeinfo/ITypeInfoFactory.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.serde2.typeinfo;
+
+import org.apache.hadoop.classification.InterfaceAudience.LimitedPrivate;
+
+import java.util.List;
+
+@LimitedPrivate("Hive")
+public interface ITypeInfoFactory {
+  /**
+   * Get or create a primitive TypeInfo object of name typeName with the given
+   * parameters. E.g. a primitive TypeInfo of char(10) is represented by the
+   * type name char and 10 as the parameter. Similarly, decimal(10,2) has the
+   * type name decimal and 10,2 as its parameters.
+   *
+   * @param typeName name of the type
+   * @param parameters optional parameters in case of parameterized primitive types
+   * @return TypeInfo representing the primitive TypeInfo
+   */
+  MetastorePrimitiveTypeInfo getPrimitiveTypeInfo(String typeName, Object... parameters);
+
+  /**
+   * Get or create a Map type TypeInfo.
+   *
+   * @param keyTypeInfo TypeInfo for the key
+   * @param valueTypeInfo TypeInfo for the value
+   * @return MapTypeInfo
+   */
+  MapTypeInfo getMapTypeInfo(TypeInfo keyTypeInfo, TypeInfo valueTypeInfo);
+
+  /**
+   * Get or create a List type TypeInfo.
+   *
+   * @param listElementTypeInfo TypeInfo of the list elements
+   * @return ListTypeInfo
+   */
+  ListTypeInfo getListTypeInfo(TypeInfo listElementTypeInfo);
+
+  /**
+   * Get or create a UnionTypeInfo.
+   *
+   * @param typeInfos child TypeInfos for the UnionTypeInfo
+   * @return UnionTypeInfo
+   */
+  UnionTypeInfo getUnionTypeInfo(List<TypeInfo> typeInfos);
+
+  /**
+   * Get or create a StructTypeInfo.
+   *
+   * @param names names of the fields in the struct
+   * @param typeInfos TypeInfos for each field
+   * @return StructTypeInfo
+   */
+  StructTypeInfo getStructTypeInfo(List<String> names, List<TypeInfo> typeInfos);
+}
diff --git a/standalone-metastore/src/main/java/org/apache/hadoop/hive/serde2/typeinfo/ListTypeInfo.java b/standalone-metastore/src/main/java/org/apache/hadoop/hive/serde2/typeinfo/ListTypeInfo.java
new file mode 100644
index 0000000000000000000000000000000000000000..6e43ad7886d5661fe5accee66e9d05ca3eeb090c
--- /dev/null
+++ b/standalone-metastore/src/main/java/org/apache/hadoop/hive/serde2/typeinfo/ListTypeInfo.java
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +package org.apache.hadoop.hive.serde2.typeinfo; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.hive.metastore.ColumnType; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; + +import java.io.Serializable; + + +/** + * A List Type has homogeneous elements. All elements of the List has the same + * TypeInfo which is returned by getListElementTypeInfo. + * + * Always use the TypeInfoFactory to create new TypeInfo objects, instead of + * directly creating an instance of this class. + */ +@InterfaceAudience.Public +@InterfaceStability.Stable +public final class ListTypeInfo extends TypeInfo implements Serializable { + + private static final long serialVersionUID = 1L; + private TypeInfo listElementTypeInfo; + + /** + * For java serialization use only. + */ + public ListTypeInfo() { + } + + @Override + public String getTypeName() { + return ColumnType.LIST_TYPE_NAME + "<" + + listElementTypeInfo.getTypeName() + ">"; + } + + /** + * For java serialization use only. + */ + public void setListElementTypeInfo(TypeInfo listElementTypeInfo) { + this.listElementTypeInfo = listElementTypeInfo; + } + + /** + * For TypeInfoFactory use only. + */ + ListTypeInfo(TypeInfo elementTypeInfo) { + listElementTypeInfo = elementTypeInfo; + } + + @Override + public Category getCategory() { + return ObjectInspector.Category.LIST; + } + + public TypeInfo getListElementTypeInfo() { + return listElementTypeInfo; + } + + @Override + public boolean equals(Object other) { + if (this == other) { + return true; + } + if (!(other instanceof ListTypeInfo)) { + return false; + } + return getListElementTypeInfo().equals( + ((ListTypeInfo) other).getListElementTypeInfo()); + } + + @Override + public int hashCode() { + return listElementTypeInfo.hashCode(); + } + +} diff --git a/standalone-metastore/src/main/java/org/apache/hadoop/hive/serde2/typeinfo/MapTypeInfo.java b/standalone-metastore/src/main/java/org/apache/hadoop/hive/serde2/typeinfo/MapTypeInfo.java new file mode 100644 index 0000000000000000000000000000000000000000..82cceaf0ca820d6a4522be71baa479ac0c1d21a1 --- /dev/null +++ b/standalone-metastore/src/main/java/org/apache/hadoop/hive/serde2/typeinfo/MapTypeInfo.java @@ -0,0 +1,112 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.serde2.typeinfo; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.hive.metastore.ColumnType; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; + +import java.io.Serializable; + + +/** + * A Map Type has homogeneous keys and homogeneous values. All keys of the Map + * have the same TypeInfo, which is returned by getMapKeyTypeInfo(); and all + * values of the Map has the same TypeInfo, which is returned by + * getMapValueTypeInfo(). + * + * Always use the TypeInfoFactory to create new TypeInfo objects, instead of + * directly creating an instance of this class. + */ +@InterfaceAudience.Public +@InterfaceStability.Stable +public final class MapTypeInfo extends TypeInfo implements Serializable { + + private static final long serialVersionUID = 1L; + + private TypeInfo mapKeyTypeInfo; + private TypeInfo mapValueTypeInfo; + + /** + * For java serialization use only. + */ + public MapTypeInfo() { + } + + @Override + public String getTypeName() { + return ColumnType.MAP_TYPE_NAME + "<" + + mapKeyTypeInfo.getTypeName() + "," + mapValueTypeInfo.getTypeName() + + ">"; + } + + /** + * For java serialization use only. + */ + public void setMapKeyTypeInfo(TypeInfo mapKeyTypeInfo) { + this.mapKeyTypeInfo = mapKeyTypeInfo; + } + + /** + * For java serialization use only. + */ + public void setMapValueTypeInfo(TypeInfo mapValueTypeInfo) { + this.mapValueTypeInfo = mapValueTypeInfo; + } + + // For TypeInfoFactory use only + MapTypeInfo(TypeInfo keyTypeInfo, TypeInfo valueTypeInfo) { + mapKeyTypeInfo = keyTypeInfo; + mapValueTypeInfo = valueTypeInfo; + } + + @Override + public Category getCategory() { + return ObjectInspector.Category.MAP; + } + + public TypeInfo getMapKeyTypeInfo() { + return mapKeyTypeInfo; + } + + public TypeInfo getMapValueTypeInfo() { + return mapValueTypeInfo; + } + + @Override + public boolean equals(Object other) { + if (this == other) { + return true; + } + if (!(other instanceof MapTypeInfo)) { + return false; + } + MapTypeInfo o = (MapTypeInfo) other; + return o.getMapKeyTypeInfo().equals(getMapKeyTypeInfo()) + && o.getMapValueTypeInfo().equals(getMapValueTypeInfo()); + } + + @Override + public int hashCode() { + return mapKeyTypeInfo.hashCode() ^ mapValueTypeInfo.hashCode(); + } + +} diff --git a/standalone-metastore/src/main/java/org/apache/hadoop/hive/serde2/typeinfo/MetastorePrimitiveTypeInfo.java b/standalone-metastore/src/main/java/org/apache/hadoop/hive/serde2/typeinfo/MetastorePrimitiveTypeInfo.java new file mode 100644 index 0000000000000000000000000000000000000000..566105d195dff8da51acb08491d5aa1052e38b69 --- /dev/null +++ b/standalone-metastore/src/main/java/org/apache/hadoop/hive/serde2/typeinfo/MetastorePrimitiveTypeInfo.java @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.serde2.typeinfo; + +import org.apache.hadoop.classification.InterfaceAudience.LimitedPrivate; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; + +import java.io.Serializable; + +/** + * This class represents a PrimitiveTypeInfo. Hive extends this class to create PrimitiveTypeInfo + */ +@LimitedPrivate("Hive") +public class MetastorePrimitiveTypeInfo extends TypeInfo implements Serializable { + // Base name (varchar vs fully qualified name such as varchar(200)). + protected String typeName; + + public MetastorePrimitiveTypeInfo() { + } + + public MetastorePrimitiveTypeInfo(String typeName) { + this.typeName = typeName; + } + + // The following 2 methods are for java serialization use only. + public void setTypeName(String typeName) { + this.typeName = typeName; + } + + @Override + public String getTypeName() { + return typeName; + } + + @Override + public Category getCategory() { + return Category.PRIMITIVE; + } + + @Override + public boolean equals(Object other) { + if (this == other) { + return true; + } + if (other == null || getClass() != other.getClass()) { + return false; + } + + MetastorePrimitiveTypeInfo pti = (MetastorePrimitiveTypeInfo) other; + + return this.typeName.equals(pti.typeName); + } + + /** + * Generate the hashCode for this TypeInfo. + */ + @Override + public int hashCode() { + return typeName.hashCode(); + } + + @Override + public String toString() { + return typeName; + } + + private static final Object[] EMPTY_OBJECT_ARRAY = new Object[0]; + + /** + * parameterized TypeInfos should override this to return array of parameters + * @return + */ + public Object[] getParameters() { + //default is no parameters + return EMPTY_OBJECT_ARRAY; + } +} diff --git a/standalone-metastore/src/main/java/org/apache/hadoop/hive/serde2/typeinfo/MetastoreTypeInfoFactory.java b/standalone-metastore/src/main/java/org/apache/hadoop/hive/serde2/typeinfo/MetastoreTypeInfoFactory.java new file mode 100644 index 0000000000000000000000000000000000000000..de10641b164f3c56fa4b67f89e0bc28bdad29ef3 --- /dev/null +++ b/standalone-metastore/src/main/java/org/apache/hadoop/hive/serde2/typeinfo/MetastoreTypeInfoFactory.java @@ -0,0 +1,128 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hive.serde2.typeinfo; + +import org.apache.hadoop.hive.metastore.ColumnType; + +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.ConcurrentHashMap; + +public class MetastoreTypeInfoFactory implements ITypeInfoFactory { + + private static final MetastoreTypeInfoFactory instance = new MetastoreTypeInfoFactory(); + + public static final MetastoreTypeInfoFactory getInstance() { + return instance; + } + private static ConcurrentHashMap cachedPrimitiveTypeInfo = + new ConcurrentHashMap<>(); + + @Override + public MetastorePrimitiveTypeInfo getPrimitiveTypeInfo(String typeName, Object... parameters) { + String qualifiedTypeName = MetastoreTypeInfoUtils + .getQualifiedPrimitiveTypeName(typeName, parameters); + MetastorePrimitiveTypeInfo result = cachedPrimitiveTypeInfo.get(qualifiedTypeName); + if (result != null) { + return result; + } + + if (ColumnType.CHAR_TYPE_NAME.equals(typeName) || ColumnType.VARCHAR_TYPE_NAME + .equals(typeName)) { + MetastoreTypeInfoUtils.validateCharVarCharParameters((int) parameters[0]); + } else if (ColumnType.DECIMAL_TYPE_NAME.equals(typeName)) { + MetastoreTypeInfoUtils.validateDecimalParameters((int) parameters[0], (int) parameters[1]); + } + // Not found in the cache. Must be parameterized types. Create it. + result = new MetastorePrimitiveTypeInfo(qualifiedTypeName); + + MetastorePrimitiveTypeInfo prev = cachedPrimitiveTypeInfo.putIfAbsent(qualifiedTypeName, result); + if (prev != null) { + result = prev; + } + return result; + } + + private static ConcurrentHashMap, MapTypeInfo> cachedMapTypeInfo = + new ConcurrentHashMap<>(); + @Override + public MapTypeInfo getMapTypeInfo(TypeInfo keyTypeInfo, + TypeInfo valueTypeInfo) { + ArrayList signature = new ArrayList(2); + signature.add(keyTypeInfo); + signature.add(valueTypeInfo); + MapTypeInfo result = cachedMapTypeInfo.get(signature); + if (result == null) { + result = new MapTypeInfo(keyTypeInfo, valueTypeInfo); + MapTypeInfo prev = cachedMapTypeInfo.putIfAbsent(signature, result); + if (prev != null) { + result = prev; + } + } + return result; + } + + private static ConcurrentHashMap cachedListTypeInfo = new ConcurrentHashMap<>(); + + @Override + public ListTypeInfo getListTypeInfo(TypeInfo listElementTypeInfo) { + ListTypeInfo result = cachedListTypeInfo.get(listElementTypeInfo); + if (result == null) { + result = new ListTypeInfo(listElementTypeInfo); + ListTypeInfo prev = cachedListTypeInfo.putIfAbsent(listElementTypeInfo, result); + if (prev != null) { + result = prev; + } + } + return result; + } + + private static ConcurrentHashMap, UnionTypeInfo> cachedUnionTypeInfo = + new ConcurrentHashMap<>(); + + @Override + public UnionTypeInfo getUnionTypeInfo(List typeInfos) { + UnionTypeInfo result = cachedUnionTypeInfo.get(typeInfos); + if (result == null) { + result = new UnionTypeInfo(typeInfos); + UnionTypeInfo prev = cachedUnionTypeInfo.putIfAbsent(typeInfos, result); + if (prev != null) { + result = prev; + } + } + return result; + } + static ConcurrentHashMap>, StructTypeInfo> cachedStructTypeInfo = + new ConcurrentHashMap<>(); + @Override + public StructTypeInfo getStructTypeInfo(List names, + List typeInfos) { + ArrayList> signature = new ArrayList>(2); + signature.add(names); + signature.add(typeInfos); + StructTypeInfo result = cachedStructTypeInfo.get(signature); + if (result == null) { + result = new StructTypeInfo(names, typeInfos); + StructTypeInfo prev = cachedStructTypeInfo.putIfAbsent(signature, result); + if (prev != null) { + result 
= prev; + } + } + return result; + } +} diff --git a/standalone-metastore/src/main/java/org/apache/hadoop/hive/serde2/typeinfo/MetastoreTypeInfoUtils.java b/standalone-metastore/src/main/java/org/apache/hadoop/hive/serde2/typeinfo/MetastoreTypeInfoUtils.java new file mode 100644 index 0000000000000000000000000000000000000000..780dc50be6d71cf73979352f26843f4f4e7c79dc --- /dev/null +++ b/standalone-metastore/src/main/java/org/apache/hadoop/hive/serde2/typeinfo/MetastoreTypeInfoUtils.java @@ -0,0 +1,59 @@ +package org.apache.hadoop.hive.serde2.typeinfo; + +public class MetastoreTypeInfoUtils { + private MetastoreTypeInfoUtils() { + } + + /** + * Metastore is not supposed to enforce type ranges. The type range checks should be left + * to the implementation engines. This method does a very lenient check which is obvious + * and makes sense for overall sanity of decimal types + * @param precision decimal precision value + * @param scale decimal scale value + */ + public static void validateDecimalParameters(int precision, int scale) { + if (precision < 0) { + throw new IllegalArgumentException("Precision cannot be negative"); + } + if (scale < 0) { + throw new IllegalArgumentException("Scale cannot be negative"); + } + } + + /** + * Metastore is not supposed to enforce type ranges. The type range checks should be left + * to the implementation engines. This method does a very lenient check which is obvious + * and makes sense for overall sanity of char types + * @param length + */ + public static void validateCharVarCharParameters(int length) { + if (length < 0) { + throw new IllegalArgumentException("Length cannot be negative"); + } + } + + static String getQualifiedPrimitiveTypeName(String type, Object... parameters) { + StringBuilder sb = new StringBuilder(type); + if (parameters == null || parameters.length == 0) { + return sb.toString(); + } + sb.append('('); + for (int i = 0; i < parameters.length; i++) { + sb.append(parameters[i]); + if (i != (parameters.length - 1)) { + sb.append(','); + } + } + sb.append(')'); + return sb.toString(); + } + + public static String getBaseName(String typeName) { + int idx = typeName.indexOf('('); + if (idx == -1) { + return typeName; + } else { + return typeName.substring(0, idx); + } + } +} diff --git a/standalone-metastore/src/main/java/org/apache/hadoop/hive/serde2/typeinfo/StructTypeInfo.java b/standalone-metastore/src/main/java/org/apache/hadoop/hive/serde2/typeinfo/StructTypeInfo.java new file mode 100644 index 0000000000000000000000000000000000000000..064fd57c3ce9ad36da540549e23db66b15354765 --- /dev/null +++ b/standalone-metastore/src/main/java/org/apache/hadoop/hive/serde2/typeinfo/StructTypeInfo.java @@ -0,0 +1,152 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.serde2.typeinfo; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.hive.metastore.ColumnType; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; + +/** + * StructTypeInfo represents the TypeInfo of a struct. A struct contains one or + * more fields each of which has a unique name and its own TypeInfo. Different + * fields can have the same or different TypeInfo. + * + * Always use the TypeInfoFactory to create new TypeInfo objects, instead of + * directly creating an instance of this class. + */ +@InterfaceAudience.Public +@InterfaceStability.Stable +public final class StructTypeInfo extends TypeInfo implements Serializable { + + private static final long serialVersionUID = 1L; + + private ArrayList allStructFieldNames; + private ArrayList allStructFieldTypeInfos; + + /** + * For java serialization use only. + */ + public StructTypeInfo() { + } + + @Override + public String getTypeName() { + StringBuilder sb = new StringBuilder(); + sb.append(ColumnType.STRUCT_TYPE_NAME + "<"); + for (int i = 0; i < allStructFieldNames.size(); i++) { + if (i > 0) { + sb.append(","); + } + sb.append(allStructFieldNames.get(i)); + sb.append(":"); + sb.append(allStructFieldTypeInfos.get(i).getTypeName()); + } + sb.append(">"); + return sb.toString(); + } + + /** + * For java serialization use only. + */ + public void setAllStructFieldNames(ArrayList allStructFieldNames) { + this.allStructFieldNames = allStructFieldNames; + } + + /** + * For java serialization use only. + */ + public void setAllStructFieldTypeInfos( + ArrayList allStructFieldTypeInfos) { + this.allStructFieldTypeInfos = allStructFieldTypeInfos; + } + + /** + * For TypeInfoFactory use only. 
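+   * Both lists are copied defensively, so later mutation of the caller's
+   * lists cannot affect the (possibly cached and shared) instance.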
+ */ + StructTypeInfo(List names, List typeInfos) { + allStructFieldNames = new ArrayList(names); + allStructFieldTypeInfos = new ArrayList(typeInfos); + } + + @Override + public Category getCategory() { + return ObjectInspector.Category.STRUCT; + } + + public ArrayList getAllStructFieldNames() { + return allStructFieldNames; + } + + public ArrayList getAllStructFieldTypeInfos() { + return allStructFieldTypeInfos; + } + + public TypeInfo getStructFieldTypeInfo(String field) { + String fieldLowerCase = field.toLowerCase(); + for (int i = 0; i < allStructFieldNames.size(); i++) { + if (fieldLowerCase.equalsIgnoreCase(allStructFieldNames.get(i))) { + return allStructFieldTypeInfos.get(i); + } + } + throw new RuntimeException("cannot find field " + field + + "(lowercase form: " + fieldLowerCase + ") in " + allStructFieldNames); + // return null; + } + + @Override + public boolean equals(Object other) { + if (this == other) { + return true; + } + if (!(other instanceof StructTypeInfo)) { + return false; + } + StructTypeInfo o = (StructTypeInfo) other; + Iterator namesIterator = getAllStructFieldNames().iterator(); + Iterator otherNamesIterator = o.getAllStructFieldNames().iterator(); + + // Compare the field names using ignore-case semantics + while (namesIterator.hasNext() && otherNamesIterator.hasNext()) { + if (!namesIterator.next().equalsIgnoreCase(otherNamesIterator.next())) { + return false; + } + } + + // Different number of field names + if (namesIterator.hasNext() || otherNamesIterator.hasNext()) { + return false; + } + + // Compare the field types + return o.getAllStructFieldTypeInfos().equals(getAllStructFieldTypeInfos()); + } + + @Override + public int hashCode() { + return allStructFieldNames.hashCode() ^ allStructFieldTypeInfos.hashCode(); + } + +} diff --git a/standalone-metastore/src/main/java/org/apache/hadoop/hive/serde2/typeinfo/TypeInfo.java b/standalone-metastore/src/main/java/org/apache/hadoop/hive/serde2/typeinfo/TypeInfo.java new file mode 100644 index 0000000000000000000000000000000000000000..7e51401e5f7bb9ef997ec75f84eaa8b80a30fbe6 --- /dev/null +++ b/standalone-metastore/src/main/java/org/apache/hadoop/hive/serde2/typeinfo/TypeInfo.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.serde2.typeinfo; + +import java.io.Serializable; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; + +/** + * Stores information about a type. Always use the TypeInfoFactory to create new + * TypeInfo objects. + * + * We support 8 categories of types: + * 1. Primitive objects (String, Number, etc) + * 2. 
List objects (a list of objects of a single type)
+ * 3. Map objects (a map from objects of one type to objects of another type)
+ * 4. Struct objects (a list of fields with names and their own types)
+ * 5. Union objects
+ * 6. Decimal objects
+ * 7. Char objects
+ * 8. Varchar objects
+ */
+@InterfaceAudience.Public
+@InterfaceStability.Unstable
+public abstract class TypeInfo implements Serializable {
+
+  private static final long serialVersionUID = 1L;
+
+  protected TypeInfo() {
+  }
+
+  /**
+   * A String representation of the TypeInfo.
+   */
+  public abstract String getTypeName();
+
+  /**
+   * @return the category this TypeInfo belongs to (primitive, list, map,
+   *         struct or union).
+   */
+  public abstract Category getCategory();
+
+  /**
+   * String representing the qualified type name.
+   * Qualified types should override this method.
+   * @return the qualified name; for non-qualified types this is simply the
+   *         type name.
+   */
+  public String getQualifiedName() {
+    return getTypeName();
+  }
+
+  @Override
+  public String toString() {
+    return getTypeName();
+  }
+
+  @Override
+  public abstract boolean equals(Object o);
+
+  @Override
+  public abstract int hashCode();
+
+  public boolean accept(TypeInfo other) {
+    return this.equals(other);
+  }
+}
diff --git a/standalone-metastore/src/main/java/org/apache/hadoop/hive/serde2/typeinfo/TypeInfoParser.java b/standalone-metastore/src/main/java/org/apache/hadoop/hive/serde2/typeinfo/TypeInfoParser.java
new file mode 100644
index 0000000000000000000000000000000000000000..7bdad93145e049ee39ee46c810d96794cd4220dc
--- /dev/null
+++ b/standalone-metastore/src/main/java/org/apache/hadoop/hive/serde2/typeinfo/TypeInfoParser.java
@@ -0,0 +1,343 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.serde2.typeinfo;
+
+import org.apache.hadoop.classification.InterfaceAudience.LimitedPrivate;
+import org.apache.hadoop.hive.common.type.HiveDecimal;
+import org.apache.hadoop.hive.metastore.ColumnType;
+
+import java.util.ArrayList;
+import java.util.LinkedList;
+import java.util.List;
+
+/**
+ * Parse a recursive TypeInfo list String. For example, the following input
+ * is valid:
+ * "int,string,map<string,int>,list<map<int,list<string>>>,list<map<int,string>>"
+ * The separators between TypeInfos can be ",", ":", or ";".
+ *
+ * In order to use this class:
+ *   TypeInfoParser parser = new TypeInfoParser("int,string", typeInfoFactory);
+ *   ArrayList<TypeInfo> typeInfos = parser.parseTypeInfos();
+ */
+@LimitedPrivate("Hive")
+public class TypeInfoParser {
+
+  private static class Token {
+    public int position;
+    public String text;
+    public boolean isType;
+
+    @Override
+    public String toString() {
+      return "" + position + ":" + text;
+    }
+  };
+
+  private static boolean isTypeChar(char c) {
+    return Character.isLetterOrDigit(c) || c == '_' || c == '.' || c == ' ' || c == '$';
+  }
+
+  /**
+   * Tokenize the typeInfoString. The rule is simple: all consecutive
+   * alphadigits and '_', '.' are in one token, and all other characters are
+   * one character per token.
+   *
+   * tokenize("map<int,string>") should return
+   * ["map","<","int",",","string",">"]
+   *
+   * Note that we add '$' in the new Calcite return path. As '$' will not
+   * appear in any type in Hive, it is safe to do so.
+   */
+  private static ArrayList<Token> tokenize(String typeInfoString) {
+    ArrayList<Token> tokens = new ArrayList<>(0);
+    int begin = 0;
+    int end = 1;
+    while (end <= typeInfoString.length()) {
+      // last character ends a token?
+      // if there are quotes, all the text between the quotes
+      // is considered a single token (this can happen for
+      // timestamp with local time-zone)
+      if (begin > 0 &&
+          typeInfoString.charAt(begin - 1) == '(' &&
+          typeInfoString.charAt(begin) == '\'') {
+        // Ignore starting quote
+        begin++;
+        do {
+          end++;
+        } while (typeInfoString.charAt(end) != '\'');
+      } else if (typeInfoString.charAt(begin) == '\'' &&
+          typeInfoString.charAt(begin + 1) == ')') {
+        // Ignore closing quote
+        begin++;
+        end++;
+      }
+      if (end == typeInfoString.length()
+          || !isTypeChar(typeInfoString.charAt(end - 1))
+          || !isTypeChar(typeInfoString.charAt(end))) {
+        Token t = new Token();
+        t.position = begin;
+        t.text = typeInfoString.substring(begin, end);
+        t.isType = isTypeChar(typeInfoString.charAt(begin));
+        tokens.add(t);
+        begin = end;
+      }
+      end++;
+    }
+    return tokens;
+  }
+
+  public TypeInfoParser(String typeInfoString, ITypeInfoFactory typeInfoFactory) {
+    this.typeInfoString = typeInfoString;
+    this.typeInfoFactory = typeInfoFactory;
+    typeInfoTokens = tokenize(typeInfoString);
+  }
+
+  private final String typeInfoString;
+  private final ArrayList<Token> typeInfoTokens;
+  private final ITypeInfoFactory typeInfoFactory;
+  private ArrayList<TypeInfo> typeInfos;
+  private int iToken;
+
+  public ArrayList<TypeInfo> parseTypeInfos() {
+    typeInfos = new ArrayList<>();
+    iToken = 0;
+    while (iToken < typeInfoTokens.size()) {
+      typeInfos.add(parseType());
+      if (iToken < typeInfoTokens.size()) {
+        Token separator = typeInfoTokens.get(iToken);
+        if (",".equals(separator.text) || ";".equals(separator.text)
+            || ":".equals(separator.text)) {
+          iToken++;
+        } else {
+          throw new IllegalArgumentException(
+              "Error: ',', ':', or ';' expected at position "
+              + separator.position + " from '" + typeInfoString + "' "
+              + typeInfoTokens);
+        }
+      }
+    }
+    return typeInfos;
+  }
+
+  private Token peek() {
+    if (iToken < typeInfoTokens.size()) {
+      return typeInfoTokens.get(iToken);
+    } else {
+      return null;
+    }
+  }
+
+  private Token expect(String item) {
+    return expect(item, null);
+  }
+
+  private Token expect(String item, String alternative) {
+    if (iToken >= typeInfoTokens.size()) {
+      throw new IllegalArgumentException("Error: " + item
+          + " expected at the end of '" + typeInfoString + "'");
+    }
+    Token t = typeInfoTokens.get(iToken);
+    if (item.equals("type")) {
+      if (!ColumnType.LIST_TYPE_NAME.equals(t.text)
+          && !ColumnType.MAP_TYPE_NAME.equals(t.text)
+          && !ColumnType.STRUCT_TYPE_NAME.equals(t.text)
+          && !ColumnType.UNION_TYPE_NAME.equals(t.text)
+          && !ColumnType.PrimitiveTypes.contains(t.text)
+          && !t.text.equals(alternative)) {
+        throw new IllegalArgumentException("Error: " + item
+            + " expected at the position " + t.position + " of '"
+            + typeInfoString + "' but '" + t.text + "' is found.");
+      }
+    } else if (item.equals("name")) {
+      if (!t.isType && !t.text.equals(alternative)) {
+        throw new IllegalArgumentException("Error: " + item
+            + " expected at the position " + t.position + " of '"
+            + typeInfoString + "' but '" + t.text + "' is found.");
+      }
+    } else {
+      if
(!item.equals(t.text) && !t.text.equals(alternative)) { + throw new IllegalArgumentException("Error: " + item + + " expected at the position " + t.position + " of '" + + typeInfoString + "' but '" + t.text + "' is found."); + } + } + iToken++; + return t; + } + + private String[] parseParams() { + List params = new LinkedList(); + + Token t = peek(); + if (t != null && t.text.equals("(")) { + expect("("); + + // checking for null in the for-loop condition prevents null-ptr exception + // and allows us to fail more gracefully with a parsing error. + for(t = peek(); (t == null) || !t.text.equals(")"); t = expect(",",")")) { + params.add(expect("name").text); + } + if (params.size() == 0) { + throw new IllegalArgumentException( + "type parameters expected for type string " + typeInfoString); + } + } + + return params.toArray(new String[params.size()]); + } + + private TypeInfo parseType() { + + Token t = expect("type"); + + // Is this a primitive type? + if (ColumnType.PrimitiveTypes.contains(t.text)) { + String[] params = parseParams(); + switch (t.text) { + case ColumnType.CHAR_TYPE_NAME: + case ColumnType.VARCHAR_TYPE_NAME: + if (params == null || params.length == 0) { + throw new IllegalArgumentException(t.text + + " type is specified without length: " + typeInfoString); + } + + int length = 1; + if (params.length == 1) { + length = Integer.parseInt(params[0]); + if (ColumnType.VARCHAR_TYPE_NAME.equals(t.text)) { + return typeInfoFactory.getPrimitiveTypeInfo(ColumnType.VARCHAR_TYPE_NAME, length); + } else { + return typeInfoFactory.getPrimitiveTypeInfo(ColumnType.CHAR_TYPE_NAME, length); + } + } else if (params.length > 1) { + throw new IllegalArgumentException( + "Type " + t.text + " only takes one parameter, but " + + params.length + " is seen"); + } + + case ColumnType.DECIMAL_TYPE_NAME: + //TODO do we need to support this? this works only by luck because + //standalone-metastore depends on storage-api and HiveDecimal happens to be + //in storage-api + int precision = HiveDecimal.USER_DEFAULT_PRECISION; + int scale = HiveDecimal.USER_DEFAULT_SCALE; + if (params == null || params.length == 0) { + // It's possible that old metadata still refers to "decimal" as a column type w/o + // precision/scale. In this case, the default (10,0) is assumed. Thus, do nothing here. + } else if (params.length == 1) { + // only precision is specified + precision = Integer.valueOf(params[0]); + } else if (params.length == 2) { + // New metadata always have two parameters. + precision = Integer.parseInt(params[0]); + scale = Integer.parseInt(params[1]); + } else if (params.length > 2) { + throw new IllegalArgumentException("Type decimal only takes two parameter, but " + + params.length + " is seen"); + } + return typeInfoFactory.getPrimitiveTypeInfo(ColumnType.DECIMAL_TYPE_NAME, precision, scale); + + default: + return typeInfoFactory.getPrimitiveTypeInfo(t.text); + } + } + + // Is this a list type? + if (ColumnType.LIST_TYPE_NAME.equals(t.text)) { + expect("<"); + TypeInfo listElementType = parseType(); + expect(">"); + return typeInfoFactory.getListTypeInfo(listElementType); + } + + // Is this a map type? + if (ColumnType.MAP_TYPE_NAME.equals(t.text)) { + expect("<"); + TypeInfo mapKeyType = parseType(); + expect(","); + TypeInfo mapValueType = parseType(); + expect(">"); + return typeInfoFactory.getMapTypeInfo(mapKeyType, mapValueType); + } + + // Is this a struct type? 
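+    // The loop below walks struct<name:type,name:type,...>, for example the
+    // (illustrative) type string struct<id:int,point:struct<x:double,y:double>>,
+    // alternating between a field name, a ':' and a recursively parsed field
+    // type until the closing '>' is consumed.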
+ if (ColumnType.STRUCT_TYPE_NAME.equals(t.text)) { + ArrayList<String> fieldNames = new ArrayList<>(); + ArrayList<TypeInfo> fieldTypeInfos = new ArrayList<>(); + boolean first = true; + do { + if (first) { + expect("<"); + first = false; + } else { + Token separator = expect(">", ","); + if (separator.text.equals(">")) { + // end of struct + break; + } + } + Token name = expect("name", ">"); + if (name.text.equals(">")) { + break; + } + fieldNames.add(name.text); + expect(":"); + fieldTypeInfos.add(parseType()); + } while (true); + + return typeInfoFactory.getStructTypeInfo(fieldNames, fieldTypeInfos); + } + // Is this a union type? + if (ColumnType.UNION_TYPE_NAME.equals(t.text)) { + List<TypeInfo> objectTypeInfos = new ArrayList<>(); + boolean first = true; + do { + if (first) { + expect("<"); + first = false; + } else { + Token separator = expect(">", ","); + if (separator.text.equals(">")) { + // end of union + break; + } + } + objectTypeInfos.add(parseType()); + } while (true); + + return typeInfoFactory.getUnionTypeInfo(objectTypeInfos); + } + + throw new RuntimeException("Internal error parsing position " + + t.position + " of '" + typeInfoString + "'"); + } + + public PrimitiveParts parsePrimitiveParts() { + PrimitiveParts parts = new PrimitiveParts(); + Token t = expect("type"); + parts.typeName = t.text; + parts.typeParams = parseParams(); + return parts; + } + + public static class PrimitiveParts { + public String typeName; + public String[] typeParams; + } +} diff --git a/standalone-metastore/src/main/java/org/apache/hadoop/hive/serde2/typeinfo/UnionTypeInfo.java b/standalone-metastore/src/main/java/org/apache/hadoop/hive/serde2/typeinfo/UnionTypeInfo.java new file mode 100644 index 0000000000000000000000000000000000000000..58e218e3502e4da98ac61f33a3ba1edd82a5b59f --- /dev/null +++ b/standalone-metastore/src/main/java/org/apache/hadoop/hive/serde2/typeinfo/UnionTypeInfo.java @@ -0,0 +1,109 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.serde2.typeinfo; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.hive.metastore.ColumnType; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.List; + +/** + * UnionTypeInfo represents the TypeInfo of a union. A union holds exactly one + * of its declared fields at any point in time. The fields a union can hold + * may have the same or different TypeInfos. + * + * Always use the TypeInfoFactory to create new TypeInfo objects, instead of + * directly creating an instance of this class.
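+ * + * For example, a union over int and string renders its type name as + * "uniontype<int,string>" (see getTypeName() below).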
+ */ +@InterfaceAudience.Public +@InterfaceStability.Stable +public class UnionTypeInfo extends TypeInfo implements Serializable { + + private static final long serialVersionUID = 1L; + + private List<TypeInfo> allUnionObjectTypeInfos; + + /** + * For java serialization use only. + */ + public UnionTypeInfo() { + } + + @Override + public String getTypeName() { + StringBuilder sb = new StringBuilder(); + sb.append(ColumnType.UNION_TYPE_NAME + "<"); + for (int i = 0; i < allUnionObjectTypeInfos.size(); i++) { + if (i > 0) { + sb.append(","); + } + sb.append(allUnionObjectTypeInfos.get(i).getTypeName()); + } + sb.append(">"); + return sb.toString(); + } + + /** + * For java serialization use only. + */ + public void setAllUnionObjectTypeInfos( + List<TypeInfo> allUnionObjectTypeInfos) { + this.allUnionObjectTypeInfos = allUnionObjectTypeInfos; + } + + /** + * For TypeInfoFactory use only. + */ + UnionTypeInfo(List<TypeInfo> typeInfos) { + allUnionObjectTypeInfos = new ArrayList<>(); + allUnionObjectTypeInfos.addAll(typeInfos); + } + + @Override + public Category getCategory() { + return ObjectInspector.Category.UNION; + } + + public List<TypeInfo> getAllUnionObjectTypeInfos() { + return allUnionObjectTypeInfos; + } + + @Override + public boolean equals(Object other) { + if (this == other) { + return true; + } + if (!(other instanceof UnionTypeInfo)) { + return false; + } + UnionTypeInfo o = (UnionTypeInfo) other; + + // Compare the field types + return o.getAllUnionObjectTypeInfos().equals(getAllUnionObjectTypeInfos()); + } + + @Override + public int hashCode() { + return allUnionObjectTypeInfos.hashCode(); + } +} diff --git a/standalone-metastore/src/test/java/org/apache/hadoop/hive/metastore/schema/reader/TestDefaultStorageSchemaReader.java b/standalone-metastore/src/test/java/org/apache/hadoop/hive/metastore/schema/reader/TestDefaultStorageSchemaReader.java new file mode 100644 index 0000000000000000000000000000000000000000..6ba90d64ef85a6c0bbd6a18e4d42b36bcad3354c --- /dev/null +++ b/standalone-metastore/src/test/java/org/apache/hadoop/hive/metastore/schema/reader/TestDefaultStorageSchemaReader.java @@ -0,0 +1,598 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +package org.apache.hadoop.hive.metastore.schema.reader; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.metastore.DefaultStorageSchemaReader; +import org.apache.hadoop.hive.metastore.HiveMetaStoreClient; +import org.apache.hadoop.hive.metastore.MetaStoreTestUtils; +import org.apache.hadoop.hive.metastore.Warehouse; +import org.apache.hadoop.hive.metastore.annotation.MetastoreUnitTest; +import org.apache.hadoop.hive.metastore.api.Database; +import org.apache.hadoop.hive.metastore.api.FieldSchema; +import org.apache.hadoop.hive.metastore.api.InvalidOperationException; +import org.apache.hadoop.hive.metastore.api.NoSuchObjectException; +import org.apache.hadoop.hive.metastore.api.Table; +import org.apache.hadoop.hive.metastore.api.Type; +import org.apache.hadoop.hive.metastore.client.builder.TableBuilder; +import org.apache.hadoop.hive.metastore.conf.MetastoreConf; +import org.apache.hadoop.hive.metastore.utils.AvroSchemaUtils; +import org.apache.hadoop.util.StringUtils; +import org.apache.thrift.TException; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; +import org.junit.experimental.categories.Category; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Properties; + +@Category(MetastoreUnitTest.class) +public class TestDefaultStorageSchemaReader { + private static final Logger LOG = LoggerFactory.getLogger(TestDefaultStorageSchemaReader.class); + private static final String TEST_DB_NAME = "TEST_DB"; + private static final String TEST_TABLE_NAME = "TEST_TABLE"; + private HiveMetaStoreClient client; + private Configuration conf; + private Warehouse warehouse; + private static final int DEFAULT_LIMIT_PARTITION_REQUEST = 100; + private static final String AVRO_SERIALIZATION_LIB = + "org.apache.hadoop.hive.serde2.avro.AvroSerDe"; + + // These schemata are used in other tests + static public final String MAP_WITH_PRIMITIVE_VALUE_TYPE = "{\n" + + " \"namespace\": \"testing\",\n" + + " \"name\": \"oneMap\",\n" + + " \"type\": \"record\",\n" + + " \"fields\": [\n" + + " {\n" + + " \"name\":\"aMap\",\n" + + " \"type\":{\"type\":\"map\",\n" + + " \"values\":\"long\"}\n" + + "\t}\n" + + " ]\n" + + "}"; + static public final String ARRAY_WITH_PRIMITIVE_ELEMENT_TYPE = "{\n" + + " \"namespace\": \"testing\",\n" + + " \"name\": \"oneArray\",\n" + + " \"type\": \"record\",\n" + + " \"fields\": [\n" + + " {\n" + + " \"name\":\"anArray\",\n" + + " \"type\":{\"type\":\"array\",\n" + + " \"items\":\"string\"}\n" + + "\t}\n" + + " ]\n" + + "}"; + public static final String RECORD_SCHEMA = "{\n" + + " \"namespace\": \"testing.test.mctesty\",\n" + + " \"name\": \"oneRecord\",\n" + + " \"type\": \"record\",\n" + + " \"fields\": [\n" + + " {\n" + + " \"name\":\"aRecord\",\n" + + " \"type\":{\"type\":\"record\",\n" + + " \"name\":\"recordWithinARecord\",\n" + + " \"fields\": [\n" + + " {\n" + + " \"name\":\"int1\",\n" + + " \"type\":\"int\"\n" + + " },\n" + + " {\n" + + " \"name\":\"boolean1\",\n" + + " \"type\":\"boolean\"\n" + + " },\n" + + " {\n" + + " \"name\":\"long1\",\n" + + " \"type\":\"long\"\n" + + " }\n" + + " ]}\n" + + " }\n" + + " ]\n" + + "}"; + public static final String NULLABLE_RECORD_SCHEMA = "[\"null\", " + RECORD_SCHEMA + "]"; + public static 
final String UNION_SCHEMA = "{\n" + + " \"namespace\": \"test.a.rossa\",\n" + + " \"name\": \"oneUnion\",\n" + + " \"type\": \"record\",\n" + + " \"fields\": [\n" + + " {\n" + + " \"name\":\"aUnion\",\n" + + " \"type\":[\"int\", \"string\"]\n" + + " }\n" + + " ]\n" + + "}"; + public static final String UNION_SCHEMA_2 = "{\n" + + " \"namespace\": \"test.a.rossa\",\n" + + " \"name\": \"oneUnion\",\n" + + " \"type\": \"record\",\n" + + " \"fields\": [\n" + + " {\n" + + " \"name\":\"aUnion\",\n" + + " \"type\":[\"null\", \"int\", \"string\"]\n" + + " }\n" + + " ]\n" + + "}"; + public static final String UNION_SCHEMA_3 = "{\n" + + " \"namespace\": \"test.a.rossa\",\n" + + " \"name\": \"oneUnion\",\n" + + " \"type\": \"record\",\n" + + " \"fields\": [\n" + + " {\n" + + " \"name\":\"aUnion\",\n" + + " \"type\":[\"float\",\"int\"]\n" + + " }\n" + + " ]\n" + + "}"; + public static final String UNION_SCHEMA_4 = "{\n" + + " \"namespace\": \"test.a.rossa\",\n" + + " \"name\": \"oneUnion\",\n" + + " \"type\": \"record\",\n" + + " \"fields\": [\n" + + " {\n" + + " \"name\":\"aUnion\",\n" + + " \"type\":[\"int\",\"float\",\"long\"]\n" + + " }\n" + + " ]\n" + + "}"; + public static final String ENUM_SCHEMA = "{\n" + + " \"namespace\": \"clever.namespace.name.in.space\",\n" + + " \"name\": \"oneEnum\",\n" + + " \"type\": \"record\",\n" + + " \"fields\": [\n" + + " {\n" + + " \"name\":\"baddies\",\n" + + " \"type\":{\"type\":\"enum\",\"name\":\"villians\", \"symbols\": " + + "[\"DALEKS\", \"CYBERMEN\", \"SLITHEEN\", \"JAGRAFESS\"]}\n" + + " \n" + + " \n" + + " }\n" + + " ]\n" + + "}"; + public static final String FIXED_SCHEMA = "{\n" + + " \"namespace\": \"ecapseman\",\n" + + " \"name\": \"oneFixed\",\n" + + " \"type\": \"record\",\n" + + " \"fields\": [\n" + + " {\n" + + " \"name\":\"hash\",\n" + + " \"type\":{\"type\": \"fixed\", \"name\": \"MD5\", \"size\": 16}\n" + + " }\n" + + " ]\n" + + "}"; + public static final String NULLABLE_STRING_SCHEMA = "{\n" + + " \"type\": \"record\", \n" + + " \"name\": \"nullableUnionTest\",\n" + + " \"fields\" : [\n" + + " {\"name\":\"nullableString\", \"type\":[\"null\", \"string\"]}\n" + + " ]\n" + + "}"; + public static final String MAP_WITH_NULLABLE_PRIMITIVE_VALUE_TYPE_SCHEMA = "{\n" + + " \"namespace\": \"testing\",\n" + + " \"name\": \"mapWithNullableUnionTest\",\n" + + " \"type\": \"record\",\n" + + " \"fields\": [\n" + + " {\n" + + " \"name\":\"aMap\",\n" + + " \"type\":{\"type\":\"map\",\n" + + " \"values\":[\"null\",\"long\"]}\n" + + "\t}\n" + + " ]\n" + + "}"; + public static final String NULLABLE_ENUM_SCHEMA = "{\n" + + " \"namespace\": \"clever.namespace.name.in.space\",\n" + + " \"name\": \"nullableUnionTest\",\n" + + " \"type\": \"record\",\n" + + " \"fields\": [\n" + + " {\n" + + " \"name\":\"nullableEnum\",\n" + + " \"type\": [\"null\", {\"type\":\"enum\",\"name\":\"villians\", \"symbols\": " + + "[\"DALEKS\", \"CYBERMEN\", \"SLITHEEN\", \"JAGRAFESS\"]}]\n" + + " \n" + + " \n" + + " }\n" + + " ]\n" + + "}"; + public static final String BYTES_SCHEMA = "{\n" + + " \"type\": \"record\", \n" + + " \"name\": \"bytesTest\",\n" + + " \"fields\" : [\n" + + " {\"name\":\"bytesField\", \"type\":\"bytes\"}\n" + + " ]\n" + + "}"; + + public static final String KITCHEN_SINK_SCHEMA = "{\n" + + " \"namespace\": \"org.apache.hadoop.hive\",\n" + + " \"name\": \"kitchsink\",\n" + + " \"type\": \"record\",\n" + + " \"fields\": [\n" + + " {\n" + + " \"name\":\"string1\",\n" + + " \"type\":\"string\"\n" + + " },\n" + + " {\n" + + " \"name\":\"string2\",\n" + + " 
\"type\":\"string\"\n" + + " },\n" + + " {\n" + + " \"name\":\"int1\",\n" + + " \"type\":\"int\"\n" + + " },\n" + + " {\n" + + " \"name\":\"boolean1\",\n" + + " \"type\":\"boolean\"\n" + + " },\n" + + " {\n" + + " \"name\":\"long1\",\n" + + " \"type\":\"long\"\n" + + " },\n" + + " {\n" + + " \"name\":\"float1\",\n" + + " \"type\":\"float\"\n" + + " },\n" + + " {\n" + + " \"name\":\"double1\",\n" + + " \"type\":\"double\"\n" + + " },\n" + + " {\n" + + " \"name\":\"inner_record1\",\n" + + " \"type\":{ \"type\":\"record\",\n" + + " \"name\":\"inner_record1_impl\",\n" + + " \"fields\": [\n" + + " {\"name\":\"int_in_inner_record1\",\n" + + " \"type\":\"int\"},\n" + + " {\"name\":\"string_in_inner_record1\",\n" + + " \"type\":\"string\"}\n" + + " ]\n" + + " }\n" + + " },\n" + + " {\n" + + " \"name\":\"enum1\",\n" + + " \"type\":{\"type\":\"enum\", \"name\":\"enum1_values\", " + + "\"symbols\":[\"ENUM1_VALUES_VALUE1\",\"ENUM1_VALUES_VALUE2\", \"ENUM1_VALUES_VALUE3\"]}\n" + + " },\n" + + " {\n" + + " \"name\":\"array1\",\n" + + " \"type\":{\"type\":\"array\", \"items\":\"string\"}\n" + + " },\n" + + " {\n" + + " \"name\":\"map1\",\n" + + " \"type\":{\"type\":\"map\", \"values\":\"string\"}\n" + + " },\n" + + " {\n" + + " \"name\":\"union1\",\n" + + " \"type\":[\"float\", \"boolean\", \"string\"]\n" + + " },\n" + + " {\n" + + " \"name\":\"fixed1\",\n" + + " \"type\":{\"type\":\"fixed\", \"name\":\"fourbytes\", \"size\":4}\n" + + " },\n" + + " {\n" + + " \"name\":\"null1\",\n" + + " \"type\":\"null\"\n" + + " },\n" + + " {\n" + + " \"name\":\"UnionNullInt\",\n" + + " \"type\":[\"int\", \"null\"]\n" + + " },\n" + + " {\n" + + " \"name\":\"bytes1\",\n" + + " \"type\":\"bytes\"\n" + + " }\n" + + " ]\n" + + "}"; + + @Before + public void setUp() throws Exception { + conf = MetastoreConf.newMetastoreConf(); + warehouse = new Warehouse(conf); + + // set some values to use for getting conf. 
vars + MetastoreConf.setBoolVar(conf, MetastoreConf.ConfVars.METRICS_ENABLED, true); + conf.set("hive.key1", "value1"); + conf.set("hive.key2", "http://www.example.com"); + conf.set("hive.key3", ""); + conf.set("hive.key4", "0"); + conf.set("datanucleus.autoCreateTables", "false"); + + MetaStoreTestUtils.setConfForStandloneMode(conf); + MetastoreConf.setLongVar(conf, MetastoreConf.ConfVars.BATCH_RETRIEVE_MAX, 2); + MetastoreConf.setLongVar(conf, MetastoreConf.ConfVars.LIMIT_PARTITION_REQUEST, + DEFAULT_LIMIT_PARTITION_REQUEST); + MetastoreConf.setVar(conf, MetastoreConf.ConfVars.STORAGE_SCHEMA_READER_IMPL, + DefaultStorageSchemaReader.class.getName()); + client = createClient(); + } + + @After + public void closeClient() { + client.close(); + } + + private void silentDropDatabase(String dbName) throws TException { + try { + for (String tableName : client.getTables(dbName, "*")) { + client.dropTable(dbName, tableName); + } + client.dropDatabase(dbName); + } catch (NoSuchObjectException | InvalidOperationException e) { + // NOP + } + } + + private HiveMetaStoreClient createClient() throws Exception { + try { + return new HiveMetaStoreClient(conf); + } catch (Throwable e) { + System.err.println("Unable to open the metastore"); + System.err.println(StringUtils.stringifyException(e)); + throw new Exception(e); + } + } + + @Test + public void testSimpleAvroTable() throws TException, IOException { + List<FieldSchema> fields = new ArrayList<>(2); + FieldSchema field = new FieldSchema(); + field.setName("name"); + field.setType("string"); + field.setComment("Test name comment"); + fields.add(field); + + field = new FieldSchema(); + field.setName("age"); + field.setType("int"); + field.setComment("Test age comment"); + fields.add(field); + + createTable(TEST_DB_NAME, TEST_TABLE_NAME, AVRO_SERIALIZATION_LIB, fields, null); + List<FieldSchema> retFields = client.getFields(TEST_DB_NAME, TEST_TABLE_NAME); + verifyTableFields(fields, retFields, null); + } + + private Table createTable(String dbName, String tblName, String serializationLib, + List<FieldSchema> fields, Map<String, String> tblProperties) throws TException, IOException { + client.dropTable(dbName, tblName); + silentDropDatabase(dbName); + Database db = new Database(); + db.setName(dbName); + client.createDatabase(db); + db = client.getDatabase(dbName); + Path dbPath = new Path(db.getLocationUri()); + FileSystem fs = FileSystem.get(dbPath.toUri(), conf); + String typeName = "dummy"; + client.dropType(typeName); + Type typ1 = new Type(); + typ1.setName(typeName); + typ1.setFields(fields); + client.createType(typ1); + + Table tbl = new TableBuilder().setDbName(dbName).setTableName(tblName).setCols(typ1.getFields()) + .setSerdeLib(serializationLib).setTableParams(tblProperties).build(); + client.createTable(tbl); + return client.getTable(dbName, tblName); + } + + @Test + public void testExternalSchemaAvroTable() throws TException, IOException { + //map + createAvroTableWithExternalSchema(TEST_DB_NAME, TEST_TABLE_NAME, MAP_WITH_PRIMITIVE_VALUE_TYPE); + List<FieldSchema> retFields = client.getFields(TEST_DB_NAME, TEST_TABLE_NAME); + Assert.assertEquals("Unexpected number of fields", 1, retFields.size()); + Assert.assertEquals("Unexpected name of the field", "aMap", retFields.get(0).getName()); + Assert.assertEquals("Unexpected type of the field", "map<string,bigint>", + retFields.get(0).getType()); + Assert.assertEquals("Unexpected comment of the field", "", retFields.get(0).getComment()); + + //list + createAvroTableWithExternalSchema(TEST_DB_NAME, TEST_TABLE_NAME, + ARRAY_WITH_PRIMITIVE_ELEMENT_TYPE); + retFields =
client.getFields(TEST_DB_NAME, TEST_TABLE_NAME); + Assert.assertEquals("Unexpected number of fields", 1, retFields.size()); + Assert.assertEquals("Unexpected name of the field", "anArray", retFields.get(0).getName()); + Assert + .assertEquals("Unexpected type of the field", "array<string>", retFields.get(0).getType()); + + //struct + createAvroTableWithExternalSchema(TEST_DB_NAME, TEST_TABLE_NAME, RECORD_SCHEMA); + retFields = client.getFields(TEST_DB_NAME, TEST_TABLE_NAME); + Assert.assertEquals("Unexpected number of fields", 1, retFields.size()); + Assert.assertEquals("Unexpected name of the field", "aRecord", retFields.get(0).getName()); + Assert.assertEquals("Unexpected type of the field", + "struct<int1:int,boolean1:boolean,long1:long>", retFields.get(0).getType()); + + //union + createAvroTableWithExternalSchema(TEST_DB_NAME, TEST_TABLE_NAME, UNION_SCHEMA); + retFields = client.getFields(TEST_DB_NAME, TEST_TABLE_NAME); + Assert.assertEquals("Unexpected number of fields", 1, retFields.size()); + Assert.assertEquals("Unexpected name of the field", "aUnion", retFields.get(0).getName()); + Assert.assertEquals("Unexpected type of the field", "uniontype<int,string>", + retFields.get(0).getType()); + + //union-2 + createAvroTableWithExternalSchema(TEST_DB_NAME, TEST_TABLE_NAME, UNION_SCHEMA_2); + retFields = client.getFields(TEST_DB_NAME, TEST_TABLE_NAME); + Assert.assertEquals("Unexpected number of fields", 1, retFields.size()); + Assert.assertEquals("Unexpected name of the field", "aUnion", retFields.get(0).getName()); + Assert.assertEquals("Unexpected type of the field", "uniontype<int,string>", + retFields.get(0).getType()); + + //union_3 + createAvroTableWithExternalSchema(TEST_DB_NAME, TEST_TABLE_NAME, UNION_SCHEMA_3); + retFields = client.getFields(TEST_DB_NAME, TEST_TABLE_NAME); + Assert.assertEquals("Unexpected number of fields", 1, retFields.size()); + Assert.assertEquals("Unexpected name of the field", "aUnion", retFields.get(0).getName()); + Assert.assertEquals("Unexpected type of the field", "uniontype<float,int>", + retFields.get(0).getType()); + + //union_4 + createAvroTableWithExternalSchema(TEST_DB_NAME, TEST_TABLE_NAME, UNION_SCHEMA_4); + retFields = client.getFields(TEST_DB_NAME, TEST_TABLE_NAME); + Assert.assertEquals("Unexpected number of fields", 1, retFields.size()); + Assert.assertEquals("Unexpected name of the field", "aUnion", retFields.get(0).getName()); + Assert.assertEquals("Unexpected type of the field", "uniontype<int,float,long>", + retFields.get(0).getType()); + + //enum + // Enums are one of two Avro types that Hive doesn't have any native support for. + // Column names - we lose the enumness of this schema + // Column types become string + createAvroTableWithExternalSchema(TEST_DB_NAME, TEST_TABLE_NAME, ENUM_SCHEMA); + retFields = client.getFields(TEST_DB_NAME, TEST_TABLE_NAME); + Assert.assertEquals("Unexpected number of fields", 1, retFields.size()); + Assert.assertEquals("Unexpected name of the field", "baddies", retFields.get(0).getName()); + Assert.assertEquals("Unexpected type of the field", "string", + retFields.get(0).getType()); + + // Hive has no concept of Avro's fixed type.
Fixed -> arrays of bytes + createAvroTableWithExternalSchema(TEST_DB_NAME, TEST_TABLE_NAME, FIXED_SCHEMA); + retFields = client.getFields(TEST_DB_NAME, TEST_TABLE_NAME); + Assert.assertEquals("Unexpected number of fields", 1, retFields.size()); + Assert.assertEquals("Unexpected name of the field", "hash", retFields.get(0).getName()); + Assert.assertEquals("Unexpected type of the field", "binary", + retFields.get(0).getType()); + + //nullable string + createAvroTableWithExternalSchema(TEST_DB_NAME, TEST_TABLE_NAME, NULLABLE_STRING_SCHEMA); + retFields = client.getFields(TEST_DB_NAME, TEST_TABLE_NAME); + Assert.assertEquals("Unexpected number of fields", 1, retFields.size()); + Assert.assertEquals("Unexpected name of the field", "nullableString", retFields.get(0).getName()); + Assert.assertEquals("Unexpected type of the field", "string", + retFields.get(0).getType()); + + //map with nullable value - That Union[T, NULL] is converted to just T, within a Map + createAvroTableWithExternalSchema(TEST_DB_NAME, TEST_TABLE_NAME, MAP_WITH_NULLABLE_PRIMITIVE_VALUE_TYPE_SCHEMA); + retFields = client.getFields(TEST_DB_NAME, TEST_TABLE_NAME); + Assert.assertEquals("Unexpected number of fields", 1, retFields.size()); + Assert.assertEquals("Unexpected name of the field", "aMap", retFields.get(0).getName()); + Assert.assertEquals("Unexpected type of the field", "map<string,bigint>", + retFields.get(0).getType()); + + // That Union[T, NULL] is converted to just T. + createAvroTableWithExternalSchema(TEST_DB_NAME, TEST_TABLE_NAME, NULLABLE_ENUM_SCHEMA); + retFields = client.getFields(TEST_DB_NAME, TEST_TABLE_NAME); + Assert.assertEquals("Unexpected number of fields", 1, retFields.size()); + Assert.assertEquals("Unexpected name of the field", "nullableEnum", retFields.get(0).getName()); + Assert.assertEquals("Unexpected type of the field", "string", + retFields.get(0).getType()); + + createAvroTableWithExternalSchema(TEST_DB_NAME, TEST_TABLE_NAME, BYTES_SCHEMA); + retFields = client.getFields(TEST_DB_NAME, TEST_TABLE_NAME); + Assert.assertEquals("Unexpected number of fields", 1, retFields.size()); + Assert.assertEquals("Unexpected name of the field", "bytesField", retFields.get(0).getName()); + Assert.assertEquals("Unexpected type of the field", "binary", + retFields.get(0).getType()); + + createAvroTableWithExternalSchema(TEST_DB_NAME, TEST_TABLE_NAME, KITCHEN_SINK_SCHEMA); + retFields = client.getFields(TEST_DB_NAME, TEST_TABLE_NAME); + Assert.assertEquals("Unexpected number of fields", 16, retFields.size()); + //There are 16 fields in this schema.
Instead of verifying all, we verify the interesting ones + //(ones which have not been tested above) + Assert + .assertEquals("Unexpected name of 8th field", "inner_record1", retFields.get(7).getName()); + Assert.assertEquals("Unexpected type of the field", + "struct<int_in_inner_record1:int,string_in_inner_record1:string>", + retFields.get(7).getType()); + + Assert.assertEquals("Unexpected field name of the 10th field", "array1", + retFields.get(9).getName()); + Assert.assertEquals("Unexpected field type of the 10th field", "array<string>", + retFields.get(9).getType()); + + Assert.assertEquals("Unexpected field name of the 11th field", "map1", + retFields.get(10).getName()); + Assert.assertEquals("Unexpected field type of the 11th field", "map<string,string>", + retFields.get(10).getType()); + + Assert.assertEquals("Unexpected field name of the 12th field", "union1", + retFields.get(11).getName()); + Assert + .assertEquals("Unexpected field type of the 12th field", "uniontype<float,boolean,string>", + retFields.get(11).getType()); + + Assert.assertEquals("Unexpected field name of the 14th field", "null1", + retFields.get(13).getName()); + Assert.assertEquals("Unexpected field type of the 14th field", "void", + retFields.get(13).getType()); + + Assert.assertEquals("Unexpected field name of the 15th field", "UnionNullInt", + retFields.get(14).getName()); + Assert.assertEquals("Unexpected field type of the 15th field", "int", + retFields.get(14).getType()); + } + + private void createAvroTableWithExternalSchema(String dbName, String tblName, String schemaStr) + throws TException, IOException { + List<FieldSchema> fields = new ArrayList<>(0); + Map<String, String> tblParams = new HashMap<>(); + tblParams.put(AvroSchemaUtils.AvroTableProperties.SCHEMA_LITERAL.getPropName(), schemaStr); + createTable(dbName, tblName, AVRO_SERIALIZATION_LIB, fields, tblParams); + } + + @Test + public void testSimpleTable() throws TException, IOException { + List<FieldSchema> fields = new ArrayList<>(2); + FieldSchema field = new FieldSchema(); + field.setName("name"); + field.setType("string"); + field.setComment("Test name comment"); + fields.add(field); + + field = new FieldSchema(); + field.setName("age"); + field.setType("int"); + field.setComment("Test age comment"); + fields.add(field); + + createTable(TEST_DB_NAME, TEST_TABLE_NAME, null, fields, null); + List<FieldSchema> retFields = client.getFields(TEST_DB_NAME, TEST_TABLE_NAME); + verifyTableFields(fields, retFields, null); + } + + private void verifyTableFields(List<FieldSchema> expected, List<FieldSchema> actual, + String nullCommentText) { + Assert.assertEquals(expected.size(), actual.size()); + int size = expected.size(); + for (int i = 0; i < size; i++) { + FieldSchema expectedField = expected.get(i); + FieldSchema actualField = actual.get(i); + Assert.assertEquals("Name does not match for field " + (i + 1), expectedField.getName(), + actualField.getName()); + Assert.assertEquals("Type does not match for field " + (i + 1), expectedField.getType(), + actualField.getType()); + String expectedComment = null; + if (expectedField.getComment() == null && nullCommentText != null) { + expectedComment = nullCommentText; + } else { + expectedComment = expectedField.getComment(); + } + Assert.assertEquals("Comment does not match for field " + (i + 1), expectedComment, + actualField.getComment()); + } + } +} diff --git a/standalone-metastore/src/test/java/org/apache/hadoop/hive/serde2/avro/TestInstanceCache.java b/standalone-metastore/src/test/java/org/apache/hadoop/hive/serde2/avro/TestInstanceCache.java new file mode 100644 index 0000000000000000000000000000000000000000..e7ca47344242befe683f4a649d008a62e7c59dcd --- /dev/null +++
b/standalone-metastore/src/test/java/org/apache/hadoop/hive/serde2/avro/TestInstanceCache.java @@ -0,0 +1,99 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.serde2.avro; + +import java.util.Set; + +import org.apache.hadoop.hive.metastore.annotation.MetastoreUnitTest; +import org.junit.Test; +import org.junit.experimental.categories.Category; + +import static org.junit.Assert.assertSame; + +@Category(MetastoreUnitTest.class) +public class TestInstanceCache { + private static class Foo { + + private int value = 42; + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + Foo foo = (Foo) o; + + return value == foo.value; + + } + + @Override + public int hashCode() { + return value; + } + } + + private static class Wrapper<T> { + public final T wrapped; + + private Wrapper(T wrapped) { + this.wrapped = wrapped; + } + } + + @Test + public void instanceCachesOnlyCreateOneInstance() throws Exception { + InstanceCache<Foo, Wrapper<Foo>> ic = new InstanceCache<Foo, Wrapper<Foo>>() { + @Override + protected Wrapper<Foo> makeInstance(Foo hv, + Set<Foo> seenSchemas) { + return new Wrapper<>(hv); + } + }; + Foo f1 = new Foo(); + + Wrapper<Foo> fc = ic.retrieve(f1, null); + assertSame(f1, fc.wrapped); // Our original foo should be in the wrapper + + Foo f2 = new Foo(); // Different instance, same value + + Wrapper<Foo> fc2 = ic.retrieve(f2, null); + assertSame(fc2, fc); // Since equiv f, should get back first container + assertSame(fc2.wrapped, f1); + } + + @Test + public void instanceCacheReturnsCorrectInstances() throws Exception { + InstanceCache<String, Wrapper<String>> ic = new InstanceCache<String, Wrapper<String>>() { + @Override + protected Wrapper<String> makeInstance( + String hv, Set<String> seenSchemas) { + return new Wrapper<>(hv); + } + }; + + Wrapper<String> one = ic.retrieve("one", null); + Wrapper<String> two = ic.retrieve("two", null); + + Wrapper<String> anotherOne = ic.retrieve("one", null); + assertSame(one, anotherOne); + + Wrapper<String> anotherTwo = ic.retrieve("two", null); + assertSame(two, anotherTwo); + } +} diff --git a/storage-api/src/java/org/apache/hadoop/hive/serde2/objectinspector/ObjectInspector.java b/storage-api/src/java/org/apache/hadoop/hive/serde2/objectinspector/ObjectInspector.java new file mode 100644 index 0000000000000000000000000000000000000000..1762a8b589e296f24afc18d51f602f1c7b98a533 --- /dev/null +++ b/storage-api/src/java/org/apache/hadoop/hive/serde2/objectinspector/ObjectInspector.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership.
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.serde2.objectinspector; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; + +/** + * ObjectInspector helps us to look into the internal structure of a complex + * object. + * + * A (probably configured) ObjectInspector instance stands for a specific type + * and a specific way to store the data of that type in the memory. + * + * For native java Object, we can directly access the internal structure through + * member fields and methods. ObjectInspector is a way to delegate that + * functionality away from the Object, so that we have more control on the + * behavior of those actions. + * + * An efficient implementation of ObjectInspector should rely on factory, so + * that we can make sure the same ObjectInspector only has one instance. That + * also makes sure hashCode() and equals() methods of java.lang.Object directly + * works for ObjectInspector as well. + */ +@InterfaceAudience.Public +@InterfaceStability.Stable +public interface ObjectInspector extends Cloneable { + + /** + * Category. + * + */ + public static enum Category { + PRIMITIVE, LIST, MAP, STRUCT, UNION + }; + + /** + * Returns the name of the data type that is inspected by this + * ObjectInspector. This is used to display the type information to the user. + * + * For primitive types, the type name is standardized. For other types, the + * type name can be something like "list<int>", "map<int,string>", java class + * names, or user-defined type names similar to typedef. + */ + String getTypeName(); + + /** + * An ObjectInspector must inherit from one of the following interfaces if + * getCategory() returns: PRIMITIVE: PrimitiveObjectInspector LIST: + * ListObjectInspector MAP: MapObjectInspector STRUCT: StructObjectInspector. + */ + Category getCategory(); +}
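A quick usage note on the parser moved in this patch: the snippet below is a minimal sketch, not part of the diff. It relies only on the TypeInfoParser API shown above; parsePrimitiveParts() never consults the ITypeInfoFactory, so a null factory is safe for that one path (parseTypeInfos() does require a real factory implementation).

    // Split a parameterized primitive type string into base name and params.
    // The tokenizer yields "decimal", "(", "10", ",", "2", ")"; parseParams()
    // then collects the parameter tokens between the parentheses.
    TypeInfoParser parser = new TypeInfoParser("decimal(10,2)", null);
    TypeInfoParser.PrimitiveParts parts = parser.parsePrimitiveParts();
    // parts.typeName   == "decimal"
    // parts.typeParams == {"10", "2"}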