diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties index fb88c12726..e070df4281 100644 --- a/itests/src/test/resources/testconfiguration.properties +++ b/itests/src/test/resources/testconfiguration.properties @@ -470,6 +470,7 @@ minillaplocal.query.files=\ auto_sortmerge_join_7.q,\ auto_sortmerge_join_8.q,\ auto_sortmerge_join_9.q,\ + avro_extschema_insert.q,\ bucket4.q,\ bucket_groupby.q,\ bucket_many.q,\ diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java index 333b9839f9..aa62e90c0e 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java @@ -2139,6 +2139,13 @@ public static String formatBinaryString(byte[] array, int start, int length) { return names; } + /** + * Note: This will not return the correct number of columns in the case of + * Avro serde using an external schema URL, unless these properties have been + * used to initialize the Avro SerDe (which updates these properties). + * @param props TableDesc properties + * @return list of column names based on the table properties + */ public static List getColumnNames(Properties props) { List names = new ArrayList(); String colNames = props.getProperty(serdeConstants.LIST_COLUMNS); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConstantPropagateProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConstantPropagateProcFactory.java index 55eb9d8928..9179a3e6e7 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConstantPropagateProcFactory.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConstantPropagateProcFactory.java @@ -81,6 +81,7 @@ import org.apache.hadoop.hive.ql.udf.generic.GenericUDFUnixTimeStamp; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFWhen; import org.apache.hadoop.hive.serde.serdeConstants; +import org.apache.hadoop.hive.serde2.avro.AvroSerDe; import org.apache.hadoop.hive.serde2.objectinspector.ConstantObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; @@ -1267,8 +1268,11 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx ctx, Object.. } FileSinkDesc fsdesc = op.getConf(); DynamicPartitionCtx dpCtx = fsdesc.getDynPartCtx(); - if (dpCtx != null) { + // HIVE-22595: For Avro tables with external schema URL, Utilities.getDPColOffset() gives + // wrong results unless AvroSerDe.initialize() is called to update the table properties. + // To be safe just disable this check for Avro tables. 
+ if (dpCtx != null && !fsdesc.getTableInfo().getSerdeClassName().equals(AvroSerDe.class.getName())) { // Assume only 1 parent for FS operator Operator parent = op.getParentOperators().get(0); Map parentConstants = cppCtx.getPropagatedConstants(parent); diff --git a/ql/src/test/queries/clientpositive/avro_extschema_insert.q b/ql/src/test/queries/clientpositive/avro_extschema_insert.q new file mode 100644 index 0000000000..c1980713b8 --- /dev/null +++ b/ql/src/test/queries/clientpositive/avro_extschema_insert.q @@ -0,0 +1,20 @@ +set hive.exec.dynamic.partition.mode=nonstrict; + +dfs -cp ${system:hive.root}data/files/table1.avsc ${system:test.tmp.dir}/; + +create external table avro_extschema_insert1 (name string) partitioned by (p1 string) + stored as avro tblproperties ('avro.schema.url'='${system:test.tmp.dir}/table1.avsc'); + +describe avro_extschema_insert1; + +create external table avro_extschema_insert2 like avro_extschema_insert1; + +insert overwrite table avro_extschema_insert1 partition (p1='part1') values ('col1_value', 1, 'col3_value'); + +insert overwrite table avro_extschema_insert2 partition (p1) select * from avro_extschema_insert1; +select * from avro_extschema_insert2; + +dfs -rm ${system:test.tmp.dir}/table1.avsc; + +drop table avro_extschema_insert1; +drop table avro_extschema_insert2; diff --git a/ql/src/test/results/clientpositive/llap/avro_extschema_insert.q.out b/ql/src/test/results/clientpositive/llap/avro_extschema_insert.q.out new file mode 100644 index 0000000000..2976ee3290 --- /dev/null +++ b/ql/src/test/results/clientpositive/llap/avro_extschema_insert.q.out @@ -0,0 +1,84 @@ +PREHOOK: query: create external table avro_extschema_insert1 (name string) partitioned by (p1 string) +#### A masked pattern was here #### +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@avro_extschema_insert1 +POSTHOOK: query: create external table avro_extschema_insert1 (name string) partitioned by (p1 string) +#### A masked pattern was here #### +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@avro_extschema_insert1 +PREHOOK: query: describe avro_extschema_insert1 +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@avro_extschema_insert1 +POSTHOOK: query: describe avro_extschema_insert1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@avro_extschema_insert1 +col1 string +col2 bigint +col3 string +p1 string + +# Partition Information +# col_name data_type comment +p1 string +PREHOOK: query: create external table avro_extschema_insert2 like avro_extschema_insert1 +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@avro_extschema_insert2 +POSTHOOK: query: create external table avro_extschema_insert2 like avro_extschema_insert1 +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@avro_extschema_insert2 +PREHOOK: query: insert overwrite table avro_extschema_insert1 partition (p1='part1') values ('col1_value', 1, 'col3_value') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@avro_extschema_insert1@p1=part1 +POSTHOOK: query: insert overwrite table avro_extschema_insert1 partition (p1='part1') values ('col1_value', 1, 'col3_value') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@avro_extschema_insert1@p1=part1 +POSTHOOK: Lineage: avro_extschema_insert1 PARTITION(p1=part1).col1 SCRIPT [] +POSTHOOK: Lineage: avro_extschema_insert1
PARTITION(p1=part1).col2 SCRIPT [] +POSTHOOK: Lineage: avro_extschema_insert1 PARTITION(p1=part1).col3 SCRIPT [] +PREHOOK: query: insert overwrite table avro_extschema_insert2 partition (p1) select * from avro_extschema_insert1 +PREHOOK: type: QUERY +PREHOOK: Input: default@avro_extschema_insert1 +PREHOOK: Input: default@avro_extschema_insert1@p1=part1 +PREHOOK: Output: default@avro_extschema_insert2 +POSTHOOK: query: insert overwrite table avro_extschema_insert2 partition (p1) select * from avro_extschema_insert1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@avro_extschema_insert1 +POSTHOOK: Input: default@avro_extschema_insert1@p1=part1 +POSTHOOK: Output: default@avro_extschema_insert2@p1=part1 +POSTHOOK: Lineage: avro_extschema_insert2 PARTITION(p1=part1).col1 SIMPLE [(avro_extschema_insert1)avro_extschema_insert1.FieldSchema(name:col1, type:string, comment:), ] +POSTHOOK: Lineage: avro_extschema_insert2 PARTITION(p1=part1).col2 SIMPLE [(avro_extschema_insert1)avro_extschema_insert1.FieldSchema(name:col2, type:bigint, comment:), ] +POSTHOOK: Lineage: avro_extschema_insert2 PARTITION(p1=part1).col3 SIMPLE [(avro_extschema_insert1)avro_extschema_insert1.FieldSchema(name:col3, type:string, comment:), ] +PREHOOK: query: select * from avro_extschema_insert2 +PREHOOK: type: QUERY +PREHOOK: Input: default@avro_extschema_insert2 +PREHOOK: Input: default@avro_extschema_insert2@p1=part1 +#### A masked pattern was here #### +POSTHOOK: query: select * from avro_extschema_insert2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@avro_extschema_insert2 +POSTHOOK: Input: default@avro_extschema_insert2@p1=part1 +#### A masked pattern was here #### +col1_value 1 col3_value part1 +#### A masked pattern was here #### +PREHOOK: query: drop table avro_extschema_insert1 +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@avro_extschema_insert1 +PREHOOK: Output: default@avro_extschema_insert1 +POSTHOOK: query: drop table avro_extschema_insert1 +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@avro_extschema_insert1 +POSTHOOK: Output: default@avro_extschema_insert1 +PREHOOK: query: drop table avro_extschema_insert2 +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@avro_extschema_insert2 +PREHOOK: Output: default@avro_extschema_insert2 +POSTHOOK: query: drop table avro_extschema_insert2 +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@avro_extschema_insert2 +POSTHOOK: Output: default@avro_extschema_insert2 diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroSerDe.java b/serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroSerDe.java index f3ef1670e1..905e19b72a 100644 --- a/serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroSerDe.java +++ b/serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroSerDe.java @@ -104,11 +104,13 @@ public void initialize(Configuration configuration, Properties properties) throw final String columnCommentProperty = properties.getProperty(LIST_COLUMN_COMMENTS,""); final String columnNameDelimiter = properties.containsKey(serdeConstants.COLUMN_NAME_DELIMITER) ? 
properties .getProperty(serdeConstants.COLUMN_NAME_DELIMITER) : String.valueOf(SerDeUtils.COMMA); - + + boolean gotColTypesFromColProps = true; if (hasExternalSchema(properties) || columnNameProperty == null || columnNameProperty.isEmpty() || columnTypeProperty == null || columnTypeProperty.isEmpty()) { schema = determineSchemaOrReturnErrorSchema(configuration, properties); + gotColTypesFromColProps = false; } else { // Get column names and sort order columnNames = StringInternUtils.internStringsInList( @@ -137,6 +139,13 @@ public void initialize(Configuration configuration, Properties properties) throw this.columnNames = StringInternUtils.internStringsInList(aoig.getColumnNames()); this.columnTypes = aoig.getColumnTypes(); this.oi = aoig.getObjectInspector(); + // HIVE-22595: Update the column name/type properties to reflect the resolved schema, since these + // properties may later be read from the TableDesc (e.g. by Utilities.getColumnNames()). + if (!gotColTypesFromColProps) { + LOG.info("Updating column name/type properties based on current schema"); + properties.setProperty(serdeConstants.LIST_COLUMNS, String.join(",", columnNames)); + properties.setProperty(serdeConstants.LIST_COLUMN_TYPES, String.join(",", TypeInfoUtils.getTypeStringsFromTypeInfo(columnTypes))); + } if(!badSchema) { this.avroSerializer = new AvroSerializer();
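
Reviewer note (not part of the patch): the interaction between the AvroSerDe change and Utilities.getColumnNames() can be summarized with a small sketch. The standalone driver, the schema path, and the before/after comments are illustrative assumptions; they presume the URL points at the test's table1.avsc (col1/col2/col3) and that this patch is applied, since without it the properties are never rewritten.

import java.util.List;
import java.util.Properties;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.avro.AvroSerDe;

public class AvroExtSchemaPropsSketch {
  public static void main(String[] args) throws Exception {
    // Table properties as the planner initially sees them for avro_extschema_insert1:
    // only the declared DDL column ("name string") plus the external schema URL.
    Properties props = new Properties();
    props.setProperty(serdeConstants.LIST_COLUMNS, "name");
    props.setProperty(serdeConstants.LIST_COLUMN_TYPES, "string");
    props.setProperty("avro.schema.url", "file:///tmp/table1.avsc"); // hypothetical location

    // Helpers that read the raw properties (Utilities.getColumnNames(), and through it
    // the dynamic-partition column offset) only see the single DDL column here.
    List<String> beforeInit = Utilities.getColumnNames(props); // ["name"]

    // Initializing the AvroSerDe resolves the external schema; with this patch it also
    // writes the resolved column names/types back into the same Properties object.
    AvroSerDe serde = new AvroSerDe();
    serde.initialize(new Configuration(), props);

    List<String> afterInit = Utilities.getColumnNames(props); // ["col1", "col2", "col3"]
    System.out.println(beforeInit + " -> " + afterInit);
  }
}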
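
Reviewer note (not part of the patch): the corrected guard in ConstantPropagateProcFactory is meant to skip the dynamic-partition constant-folding shortcut whenever the target table's serde is Avro. A small self-contained restatement of that predicate follows; the helper and class names are made up for illustration.

import org.apache.hadoop.hive.serde2.avro.AvroSerDe;

public class AvroDpGuardSketch {
  // Mirrors the intent of the new condition: fold dynamic-partition constants only
  // when there is a dynamic-partition context AND the target serde is not Avro.
  static boolean shouldFoldDynamicPartitions(boolean hasDynPartCtx, String serdeClassName) {
    boolean targetIsAvro = AvroSerDe.class.getName().equals(serdeClassName);
    return hasDynPartCtx && !targetIsAvro;
  }

  public static void main(String[] args) {
    // Avro target: skip the shortcut, since the column properties may still describe
    // only the DDL columns rather than those resolved from avro.schema.url.
    System.out.println(shouldFoldDynamicPartitions(true, AvroSerDe.class.getName())); // false
    // Non-Avro target (e.g. ORC): the existing behaviour is unchanged.
    System.out.println(shouldFoldDynamicPartitions(true, "org.apache.hadoop.hive.ql.io.orc.OrcSerde")); // true
  }
}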
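
Reviewer note (not part of the patch): for the schema used by the test (col1 string, col2 bigint, col3 string, per the describe output above), the new block in AvroSerDe.initialize() should leave LIST_COLUMNS/LIST_COLUMN_TYPES holding the resolved columns. The joining itself can be checked without any Hive classes; the values below are taken from the expected q.out.

import java.util.Arrays;
import java.util.List;

public class ColumnPropsJoinSketch {
  public static void main(String[] args) {
    // Column names/types as resolved from table1.avsc in the test above.
    List<String> columnNames = Arrays.asList("col1", "col2", "col3");
    List<String> columnTypeNames = Arrays.asList("string", "bigint", "string");

    // Same joining as the patched code, yielding the comma-separated property values.
    String listColumns = String.join(",", columnNames);          // "col1,col2,col3"
    String listColumnTypes = String.join(",", columnTypeNames);  // "string,bigint,string"
    System.out.println(listColumns + " / " + listColumnTypes);
  }
}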