From 8235a983e09191c809e024e16204c35e4095618a Mon Sep 17 00:00:00 2001 From: Ashutosh Chauhan Date: Tue, 8 Dec 2015 10:08:02 -0800 Subject: [PATCH] HIVE-12643 : For self describing InputFormat don't replicate schema information in partitions --- .../hadoop/hive/metastore/MetaStoreUtils.java | 68 +++++++++++++--------- .../org/apache/hadoop/hive/ql/exec/Utilities.java | 2 +- .../hadoop/hive/ql/optimizer/GenMapRedUtils.java | 6 -- .../hive/ql/optimizer/physical/Vectorizer.java | 16 ++--- .../apache/hadoop/hive/ql/plan/PartitionDesc.java | 14 ++++- .../clientpositive/quotedid_tblproperty.q.out | 4 +- .../tez/vector_partition_diff_num_cols.q.out | 2 + .../vector_partition_diff_num_cols.q.out | 2 + 8 files changed, 68 insertions(+), 46 deletions(-) diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreUtils.java b/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreUtils.java index 6bc882a..84b24ab 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreUtils.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreUtils.java @@ -1012,8 +1012,38 @@ public static Properties getPartSchemaFromTableSchema( return schema; } - public static Properties getSchema( - org.apache.hadoop.hive.metastore.api.StorageDescriptor sd, + public static Properties addCols(Properties schema, List cols) { + + StringBuilder colNameBuf = new StringBuilder(); + StringBuilder colTypeBuf = new StringBuilder(); + StringBuilder colComment = new StringBuilder(); + + boolean first = true; + for (FieldSchema col : cols) { + if (!first) { + colNameBuf.append(","); + colTypeBuf.append(":"); + colComment.append('\0'); + } + colNameBuf.append(col.getName()); + colTypeBuf.append(col.getType()); + colComment.append((null != col.getComment()) ? col.getComment() : ""); + first = false; + } + schema.setProperty( + org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.META_TABLE_COLUMNS, + colNameBuf.toString()); + String colTypes = colTypeBuf.toString(); + schema.setProperty( + org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.META_TABLE_COLUMN_TYPES, + colTypes); + schema.setProperty("columns.comments", colComment.toString()); + + return schema; + + } + + public static Properties getSchemaWithoutCols(org.apache.hadoop.hive.metastore.api.StorageDescriptor sd, org.apache.hadoop.hive.metastore.api.StorageDescriptor tblsd, Map parameters, String databaseName, String tableName, List partitionKeys) { @@ -1063,30 +1093,7 @@ public static Properties getSchema( .getSerdeInfo().getSerializationLib()); } } - StringBuilder colNameBuf = new StringBuilder(); - StringBuilder colTypeBuf = new StringBuilder(); - StringBuilder colComment = new StringBuilder(); - boolean first = true; - for (FieldSchema col : tblsd.getCols()) { - if (!first) { - colNameBuf.append(","); - colTypeBuf.append(":"); - colComment.append('\0'); - } - colNameBuf.append(col.getName()); - colTypeBuf.append(col.getType()); - colComment.append((null != col.getComment()) ? col.getComment() : ""); - first = false; - } - String colNames = colNameBuf.toString(); - String colTypes = colTypeBuf.toString(); - schema.setProperty( - org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.META_TABLE_COLUMNS, - colNames); - schema.setProperty( - org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.META_TABLE_COLUMN_TYPES, - colTypes); - schema.setProperty("columns.comments", colComment.toString()); + if (sd.getCols() != null) { schema.setProperty( org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_DDL, @@ -1130,6 +1137,15 @@ public static Properties getSchema( return schema; } + public static Properties getSchema( + org.apache.hadoop.hive.metastore.api.StorageDescriptor sd, + org.apache.hadoop.hive.metastore.api.StorageDescriptor tblsd, + Map parameters, String databaseName, String tableName, + List partitionKeys) { + + return addCols(getSchemaWithoutCols(sd, tblsd, parameters, databaseName, tableName, partitionKeys), tblsd.getCols()); + } + /** * Convert FieldSchemas to columnNames. */ diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java index 449bef8..7082931 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java @@ -708,7 +708,7 @@ public static TableDesc getTableDesc(String cols, String colTypes) { } public static PartitionDesc getPartitionDesc(Partition part) throws HiveException { - return (new PartitionDesc(part)); + return new PartitionDesc(part); } public static PartitionDesc getPartitionDescFromTableDesc(TableDesc tblDesc, Partition part, diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java index 812af9a..7595065 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java @@ -527,9 +527,6 @@ public static void setMapWork(MapWork plan, ParseContext parseCtx, Set props = tsOp.getConf().getOpProps(); if (props != null) { Properties target = aliasPartnDesc.getProperties(); - if (target == null) { - aliasPartnDesc.setProperties(target = new Properties()); - } target.putAll(props); } @@ -668,9 +665,6 @@ public static void setMapWork(MapWork plan, ParseContext parseCtx, Set stack, NodeProcessorCtx procCtx, class MapWorkVectorizationNodeProcessor extends VectorizationNodeProcessor { private final MapWork mWork; - private VectorTaskColumnInfo vectorTaskColumnInfo; + private final VectorTaskColumnInfo vectorTaskColumnInfo; private final boolean isTez; public MapWorkVectorizationNodeProcessor(MapWork mWork, boolean isTez, @@ -1205,9 +1205,9 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, class ReduceWorkVectorizationNodeProcessor extends VectorizationNodeProcessor { - private VectorTaskColumnInfo vectorTaskColumnInfo; + private final VectorTaskColumnInfo vectorTaskColumnInfo; - private boolean isTez; + private final boolean isTez; private Operator rootVectorOp; @@ -1681,14 +1681,14 @@ private boolean validateExprNodeDescRecursive(ExprNodeDesc desc, VectorExpressio if (desc.getChildren() != null) { if (isInExpression && desc.getChildren().get(0).getTypeInfo().getCategory() == Category.STRUCT) { - // Don't restrict child expressions for projection. + // Don't restrict child expressions for projection. // Always use loose FILTER mode. if (!validateStructInExpression(desc, VectorExpressionDescriptor.Mode.FILTER)) { return false; } } else { for (ExprNodeDesc d : desc.getChildren()) { - // Don't restrict child expressions for projection. + // Don't restrict child expressions for projection. // Always use loose FILTER mode. if (!validateExprNodeDescRecursive(d, VectorExpressionDescriptor.Mode.FILTER)) { return false; @@ -2219,7 +2219,7 @@ private boolean canSpecializeReduceSink(ReduceSinkDesc desc, if (keySerializerClass != org.apache.hadoop.hive.serde2.binarysortable.BinarySortableSerDe.class) { return false; } - + TableDesc valueTableDesc = desc.getValueSerializeInfo(); Class valueDeserializerClass = valueTableDesc.getDeserializerClass(); if (valueDeserializerClass != org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe.class) { @@ -2278,7 +2278,7 @@ private boolean canSpecializeReduceSink(ReduceSinkDesc desc, } else { reduceSinkValueExpressions = reduceSinkValueExpressionsList.toArray(new VectorExpression[0]); } - + vectorReduceSinkInfo.setReduceSinkKeyColumnMap(reduceSinkKeyColumnMap); vectorReduceSinkInfo.setReduceSinkKeyTypeInfos(reduceSinkKeyTypeInfos); vectorReduceSinkInfo.setReduceSinkKeyColumnVectorTypes(reduceSinkKeyColumnVectorTypes); @@ -2333,7 +2333,7 @@ private boolean canSpecializeReduceSink(ReduceSinkDesc desc, } } break; - + case REDUCESINK: { VectorReduceSinkInfo vectorReduceSinkInfo = new VectorReduceSinkInfo(); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/PartitionDesc.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/PartitionDesc.java index 4d627ef..fe09bdf 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/plan/PartitionDesc.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/PartitionDesc.java @@ -29,6 +29,7 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.metastore.MetaStoreUtils; import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants; import org.apache.hadoop.hive.ql.exec.Utilities; import org.apache.hadoop.hive.ql.io.HiveFileFormatUtils; @@ -85,10 +86,17 @@ public PartitionDesc(final TableDesc table, final LinkedHashMap public PartitionDesc(final Partition part) throws HiveException { PartitionDescConstructorHelper(part, getTableDesc(part.getTable()), true); - setProperties(part.getMetadataFromPartitionSchema()); + if(Utilities.isInputFileFormatSelfDescribing(this)) { + // if IF is self describing no need to send column info per partition, since its not used anyway. + Table tbl = part.getTable(); + setProperties(MetaStoreUtils.getSchemaWithoutCols(part.getTPartition().getSd(), part.getTPartition().getSd(), + part.getParameters(), tbl.getDbName(), tbl.getTableName(), tbl.getPartitionKeys())); + } else { + setProperties(part.getMetadataFromPartitionSchema()); + } } - /** + /** * @param part Partition * @param tblDesc Table Descriptor * @param usePartSchemaProperties Use Partition Schema Properties to set the @@ -190,7 +198,7 @@ public void setOutputFileFormatClass(final Class outputFileFormatClass) { Class outputClass = outputFileFormatClass == null ? null : HiveFileFormatUtils.getOutputFormatSubstitute(outputFileFormatClass); if (outputClass != null) { - this.outputFileFormatClass = (Class) + this.outputFileFormatClass = (Class) CLASS_INTERNER.intern(outputClass); } else { this.outputFileFormatClass = outputClass; diff --git a/ql/src/test/results/clientpositive/quotedid_tblproperty.q.out b/ql/src/test/results/clientpositive/quotedid_tblproperty.q.out index ca1dbe6..3204c7d 100644 --- a/ql/src/test/results/clientpositive/quotedid_tblproperty.q.out +++ b/ql/src/test/results/clientpositive/quotedid_tblproperty.q.out @@ -16,5 +16,5 @@ PREHOOK: Input: default@xyz POSTHOOK: query: describe xyz POSTHOOK: type: DESCTABLE POSTHOOK: Input: default@xyz -valid_colname string -invalid.colname string +key string +value string diff --git a/ql/src/test/results/clientpositive/tez/vector_partition_diff_num_cols.q.out b/ql/src/test/results/clientpositive/tez/vector_partition_diff_num_cols.q.out index f23a359..9b75892 100644 --- a/ql/src/test/results/clientpositive/tez/vector_partition_diff_num_cols.q.out +++ b/ql/src/test/results/clientpositive/tez/vector_partition_diff_num_cols.q.out @@ -368,6 +368,7 @@ STAGE PLANS: sort order: Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE value expressions: _col0 (type: bigint) + Execution mode: vectorized Reducer 2 Execution mode: vectorized Reduce Operator Tree: @@ -477,6 +478,7 @@ STAGE PLANS: sort order: Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE value expressions: _col0 (type: bigint) + Execution mode: vectorized Reducer 2 Execution mode: vectorized Reduce Operator Tree: diff --git a/ql/src/test/results/clientpositive/vector_partition_diff_num_cols.q.out b/ql/src/test/results/clientpositive/vector_partition_diff_num_cols.q.out index ef92b89..b224da8 100644 --- a/ql/src/test/results/clientpositive/vector_partition_diff_num_cols.q.out +++ b/ql/src/test/results/clientpositive/vector_partition_diff_num_cols.q.out @@ -346,6 +346,7 @@ STAGE PLANS: sort order: Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE value expressions: _col0 (type: bigint) + Execution mode: vectorized Reduce Operator Tree: Group By Operator aggregations: sum(VALUE._col0) @@ -447,6 +448,7 @@ STAGE PLANS: sort order: Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE value expressions: _col0 (type: bigint) + Execution mode: vectorized Reduce Operator Tree: Group By Operator aggregations: sum(VALUE._col0) -- 1.7.12.4 (Apple Git-37)