From f1370d2238b25b838ec43f0758180ec47c4f6894 Mon Sep 17 00:00:00 2001 From: Ashutosh Chauhan Date: Tue, 8 Dec 2015 10:08:02 -0800 Subject: [PATCH] HIVE-12643 : For self describing InputFormat don't replicate schema information in partitions --- .../hadoop/hive/metastore/MetaStoreUtils.java | 68 +++++++++++++--------- .../apache/hadoop/hive/ql/exec/MapOperator.java | 6 -- .../org/apache/hadoop/hive/ql/exec/Utilities.java | 2 +- .../hadoop/hive/ql/optimizer/GenMapRedUtils.java | 6 -- .../hive/ql/optimizer/physical/Vectorizer.java | 23 +++++--- .../apache/hadoop/hive/ql/plan/PartitionDesc.java | 15 ++++- 6 files changed, 69 insertions(+), 51 deletions(-) diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreUtils.java b/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreUtils.java index 23068f8..89415aa 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreUtils.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreUtils.java @@ -1000,8 +1000,38 @@ public static Properties getPartSchemaFromTableSchema( return schema; } - public static Properties getSchema( - org.apache.hadoop.hive.metastore.api.StorageDescriptor sd, + public static Properties addCols(Properties schema, List cols) { + + StringBuilder colNameBuf = new StringBuilder(); + StringBuilder colTypeBuf = new StringBuilder(); + StringBuilder colComment = new StringBuilder(); + + boolean first = true; + for (FieldSchema col : cols) { + if (!first) { + colNameBuf.append(","); + colTypeBuf.append(":"); + colComment.append('\0'); + } + colNameBuf.append(col.getName()); + colTypeBuf.append(col.getType()); + colComment.append((null != col.getComment()) ? col.getComment() : ""); + first = false; + } + schema.setProperty( + org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.META_TABLE_COLUMNS, + colNameBuf.toString()); + String colTypes = colTypeBuf.toString(); + schema.setProperty( + org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.META_TABLE_COLUMN_TYPES, + colTypes); + schema.setProperty("columns.comments", colComment.toString()); + + return schema; + + } + + public static Properties getSchemaWithoutCols(org.apache.hadoop.hive.metastore.api.StorageDescriptor sd, org.apache.hadoop.hive.metastore.api.StorageDescriptor tblsd, Map parameters, String databaseName, String tableName, List partitionKeys) { @@ -1051,30 +1081,7 @@ public static Properties getSchema( .getSerdeInfo().getSerializationLib()); } } - StringBuilder colNameBuf = new StringBuilder(); - StringBuilder colTypeBuf = new StringBuilder(); - StringBuilder colComment = new StringBuilder(); - boolean first = true; - for (FieldSchema col : tblsd.getCols()) { - if (!first) { - colNameBuf.append(","); - colTypeBuf.append(":"); - colComment.append('\0'); - } - colNameBuf.append(col.getName()); - colTypeBuf.append(col.getType()); - colComment.append((null != col.getComment()) ? col.getComment() : ""); - first = false; - } - String colNames = colNameBuf.toString(); - String colTypes = colTypeBuf.toString(); - schema.setProperty( - org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.META_TABLE_COLUMNS, - colNames); - schema.setProperty( - org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.META_TABLE_COLUMN_TYPES, - colTypes); - schema.setProperty("columns.comments", colComment.toString()); + if (sd.getCols() != null) { schema.setProperty( org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_DDL, @@ -1118,6 +1125,15 @@ public static Properties getSchema( return schema; } + public static Properties getSchema( + org.apache.hadoop.hive.metastore.api.StorageDescriptor sd, + org.apache.hadoop.hive.metastore.api.StorageDescriptor tblsd, + Map parameters, String databaseName, String tableName, + List partitionKeys) { + + return addCols(getSchemaWithoutCols(sd, tblsd, parameters, databaseName, tableName, partitionKeys), tblsd.getCols()); + } + /** * Convert FieldSchemas to columnNames. */ diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/MapOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/MapOperator.java index 99724c1..b4dd582 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/MapOperator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/MapOperator.java @@ -21,7 +21,6 @@ import java.io.Serializable; import java.util.ArrayList; import java.util.Arrays; -import java.util.Collection; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedHashMap; @@ -31,16 +30,13 @@ import java.util.Properties; import java.util.Set; import java.util.TreeMap; -import java.util.concurrent.Future; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants; -import org.apache.hadoop.hive.ql.exec.MapOperator.MapOpCtx; import org.apache.hadoop.hive.ql.exec.mr.ExecMapperContext; import org.apache.hadoop.hive.ql.io.RecordIdentifier; -import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.metadata.VirtualColumn; import org.apache.hadoop.hive.ql.plan.MapWork; @@ -57,14 +53,12 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters.Converter; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; -import org.apache.hadoop.mapred.InputFormat; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.util.StringUtils; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java index c01994f..c698ea8 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java @@ -703,7 +703,7 @@ public static TableDesc getTableDesc(String cols, String colTypes) { } public static PartitionDesc getPartitionDesc(Partition part) throws HiveException { - return (new PartitionDesc(part)); + return new PartitionDesc(part); } public static PartitionDesc getPartitionDescFromTableDesc(TableDesc tblDesc, Partition part, diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java index a1c9651..43e8aac 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java @@ -526,9 +526,6 @@ public static void setMapWork(MapWork plan, ParseContext parseCtx, Set props = tsOp.getConf().getOpProps(); if (props != null) { Properties target = aliasPartnDesc.getProperties(); - if (target == null) { - aliasPartnDesc.setProperties(target = new Properties()); - } target.putAll(props); } @@ -666,9 +663,6 @@ public static void setMapWork(MapWork plan, ParseContext parseCtx, Set stack, NodeProcessorCtx procCtx, class MapWorkVectorizationNodeProcessor extends VectorizationNodeProcessor { private final MapWork mWork; - private VectorTaskColumnInfo vectorTaskColumnInfo; + private final VectorTaskColumnInfo vectorTaskColumnInfo; private final boolean isTez; public MapWorkVectorizationNodeProcessor(MapWork mWork, boolean isTez, @@ -1057,9 +1062,9 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, class ReduceWorkVectorizationNodeProcessor extends VectorizationNodeProcessor { - private VectorTaskColumnInfo vectorTaskColumnInfo; + private final VectorTaskColumnInfo vectorTaskColumnInfo; - private boolean isTez; + private final boolean isTez; private Operator rootVectorOp; @@ -1513,14 +1518,14 @@ private boolean validateExprNodeDescRecursive(ExprNodeDesc desc, VectorExpressio if (desc.getChildren() != null) { if (isInExpression && desc.getChildren().get(0).getTypeInfo().getCategory() == Category.STRUCT) { - // Don't restrict child expressions for projection. + // Don't restrict child expressions for projection. // Always use loose FILTER mode. if (!validateStructInExpression(desc, VectorExpressionDescriptor.Mode.FILTER)) { return false; } } else { for (ExprNodeDesc d : desc.getChildren()) { - // Don't restrict child expressions for projection. + // Don't restrict child expressions for projection. // Always use loose FILTER mode. if (!validateExprNodeDescRecursive(d, VectorExpressionDescriptor.Mode.FILTER)) { return false; @@ -2041,7 +2046,7 @@ private boolean canSpecializeReduceSink(ReduceSinkDesc desc, if (keySerializerClass != org.apache.hadoop.hive.serde2.binarysortable.BinarySortableSerDe.class) { return false; } - + TableDesc valueTableDesc = desc.getValueSerializeInfo(); Class valueDeserializerClass = valueTableDesc.getDeserializerClass(); if (valueDeserializerClass != org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe.class) { @@ -2100,7 +2105,7 @@ private boolean canSpecializeReduceSink(ReduceSinkDesc desc, } else { reduceSinkValueExpressions = reduceSinkValueExpressionsList.toArray(new VectorExpression[0]); } - + vectorReduceSinkInfo.setReduceSinkKeyColumnMap(reduceSinkKeyColumnMap); vectorReduceSinkInfo.setReduceSinkKeyTypeInfos(reduceSinkKeyTypeInfos); vectorReduceSinkInfo.setReduceSinkKeyColumnVectorTypes(reduceSinkKeyColumnVectorTypes); @@ -2154,7 +2159,7 @@ private boolean canSpecializeReduceSink(ReduceSinkDesc desc, } } break; - + case REDUCESINK: { VectorReduceSinkInfo vectorReduceSinkInfo = new VectorReduceSinkInfo(); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/PartitionDesc.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/PartitionDesc.java index b032349..9be6790 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/plan/PartitionDesc.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/PartitionDesc.java @@ -29,12 +29,14 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.metastore.MetaStoreUtils; import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants; import org.apache.hadoop.hive.ql.exec.Utilities; import org.apache.hadoop.hive.ql.io.HiveFileFormatUtils; import org.apache.hadoop.hive.ql.io.HiveOutputFormat; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.metadata.Partition; +import org.apache.hadoop.hive.ql.metadata.Table; import org.apache.hadoop.hive.serde.serdeConstants; import org.apache.hadoop.hive.serde2.Deserializer; import org.apache.hadoop.hive.serde2.SerDeUtils; @@ -84,10 +86,17 @@ public PartitionDesc(final TableDesc table, final LinkedHashMap public PartitionDesc(final Partition part) throws HiveException { PartitionDescConstructorHelper(part, Utilities.getTableDesc(part.getTable()), true); - setProperties(part.getMetadataFromPartitionSchema()); + if(Utilities.isInputFileFormatSelfDescribing(this)) { + // if IF is self describing no need to send column info per partition, since its not used anyway. + Table tbl = part.getTable(); + setProperties(MetaStoreUtils.getSchemaWithoutCols(part.getTPartition().getSd(), part.getTPartition().getSd(), + part.getParameters(), tbl.getDbName(), tbl.getTableName(), tbl.getPartitionKeys())); + } else { + setProperties(part.getMetadataFromPartitionSchema()); + } } - /** + /** * @param part Partition * @param tblDesc Table Descriptor * @param usePartSchemaProperties Use Partition Schema Properties to set the @@ -189,7 +198,7 @@ public void setOutputFileFormatClass(final Class outputFileFormatClass) { Class outputClass = outputFileFormatClass == null ? null : HiveFileFormatUtils.getOutputFormatSubstitute(outputFileFormatClass); if (outputClass != null) { - this.outputFileFormatClass = (Class) + this.outputFileFormatClass = (Class) CLASS_INTERNER.intern(outputClass); } else { this.outputFileFormatClass = outputClass; -- 1.7.12.4 (Apple Git-37)