commit e26c2c883d3bb7d8a9b48609a7c89143fec28baa Author: Mithun RK Date: Fri Aug 4 15:38:03 2017 -0700 HIVE-17181: HCatOutputFormat should expose complete output-schema (including partition-keys) for dynamic-partitioning MR jobs diff --git a/hcatalog/core/src/main/java/org/apache/hive/hcatalog/mapreduce/HCatBaseOutputFormat.java b/hcatalog/core/src/main/java/org/apache/hive/hcatalog/mapreduce/HCatBaseOutputFormat.java index 3e2ed97..6c09e6f 100644 --- a/hcatalog/core/src/main/java/org/apache/hive/hcatalog/mapreduce/HCatBaseOutputFormat.java +++ b/hcatalog/core/src/main/java/org/apache/hive/hcatalog/mapreduce/HCatBaseOutputFormat.java @@ -43,8 +43,9 @@ /** * Gets the table schema for the table specified in the HCatOutputFormat.setOutput call * on the specified job context. + * Note: This is the record-schema for the table. It does not include the table's partition columns. * @param conf the Configuration object - * @return the table schema + * @return the table schema, excluding partition columns * @throws IOException if HCatOutputFormat.setOutput has not been called for the passed context */ public static HCatSchema getTableSchema(Configuration conf) throws IOException { @@ -53,6 +54,18 @@ public static HCatSchema getTableSchema(Configuration conf) throws IOException { } /** + * Gets the table schema for the table specified in the HCatOutputFormat.setOutput call + * on the specified job context. + * Note: This is the complete table-schema, including the record-schema *and* the partitioning schema. + * @param conf the Configuration object + * @return the table schema, including the record-schema and partitioning schema. + * @throws IOException if HCatOutputFormat.setOutput has not been called for the passed context + */ + public static HCatSchema getTableSchemaWithPartitionColumns(Configuration conf) throws IOException { + return getJobInfo(conf).getTableInfo().getAllColumns(); + } + + /** * Check for validity of the output-specification for the job. * @param context information about the job * @throws IOException when output should not be attempted diff --git a/hcatalog/core/src/test/java/org/apache/hive/hcatalog/mapreduce/TestHCatOutputFormat.java b/hcatalog/core/src/test/java/org/apache/hive/hcatalog/mapreduce/TestHCatOutputFormat.java index d96b385..a4c3b17 100644 --- a/hcatalog/core/src/test/java/org/apache/hive/hcatalog/mapreduce/TestHCatOutputFormat.java +++ b/hcatalog/core/src/test/java/org/apache/hive/hcatalog/mapreduce/TestHCatOutputFormat.java @@ -46,6 +46,7 @@ import org.apache.hadoop.mapreduce.OutputCommitter; import org.apache.hadoop.mapreduce.TaskAttemptContext; +import org.apache.hive.hcatalog.data.schema.HCatSchema; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -110,7 +111,6 @@ private void initTable() throws Exception { sd.setCols(Lists.newArrayList(new FieldSchema("data_column", serdeConstants.STRING_TYPE_NAME, ""))); tbl.setSd(sd); - //sd.setLocation("hdfs://tmp"); sd.setInputFormat(RCFileInputFormat.class.getName()); sd.setOutputFormat(RCFileOutputFormat.class.getName()); sd.setParameters(new HashMap()); @@ -138,7 +138,7 @@ private void initTable() throws Exception { public void testSetOutput() throws Exception { Configuration conf = new Configuration(); - Job job = new Job(conf, "test outputformat"); + Job job = Job.getInstance(conf, "test outputformat"); Map partitionValues = new HashMap(); partitionValues.put("colname", "p1"); @@ -157,7 +157,7 @@ public void testSetOutput() throws Exception { publishTest(job); } - public void publishTest(Job job) throws Exception { + private void publishTest(Job job) throws Exception { HCatOutputFormat hcof = new HCatOutputFormat(); TaskAttemptContext tac = ShimLoader.getHadoopShims().getHCatShim().createTaskAttemptContext( job.getConfiguration(), ShimLoader.getHadoopShims().getHCatShim().createTaskAttemptID()); @@ -172,6 +172,41 @@ public void publishTest(Job job) throws Exception { StorerInfo storer = InternalUtil.extractStorerInfo(part.getSd(), part.getParameters()); assertEquals(storer.getProperties().get("hcat.testarg"), "testArgValue"); - assertTrue(part.getSd().getLocation().indexOf("p1") != -1); + assertTrue(part.getSd().getLocation().contains("p1")); + } + + public void testGetTableSchema() throws Exception { + + Configuration conf = new Configuration(); + Job job = Job.getInstance(conf, "test getTableSchema"); + HCatOutputFormat.setOutput( + job, + OutputJobInfo.create( + dbName, + tblName, + new HashMap() {{put("colname", "col_value");}} + ) + ); + + HCatSchema rowSchema = HCatOutputFormat.getTableSchema(job.getConfiguration()); + assertEquals("Row-schema should have exactly one column.", + 1, rowSchema.getFields().size()); + assertEquals("Row-schema must contain the data column.", + "data_column", rowSchema.getFields().get(0).getName()); + assertEquals("Data column should have been STRING type.", + serdeConstants.STRING_TYPE_NAME, rowSchema.getFields().get(0).getTypeString()); + + HCatSchema tableSchema = HCatOutputFormat.getTableSchemaWithPartitionColumns(job.getConfiguration()); + assertEquals("Table-schema should have exactly 2 columns.", + 2, tableSchema.getFields().size()); + assertEquals("Table-schema must contain the data column.", + "data_column", tableSchema.getFields().get(0).getName()); + assertEquals("Data column should have been STRING type.", + serdeConstants.STRING_TYPE_NAME, tableSchema.getFields().get(0).getTypeString()); + assertEquals("Table-schema must contain the partition column.", + "colname", tableSchema.getFields().get(1).getName()); + assertEquals("Partition column should have been STRING type.", + serdeConstants.STRING_TYPE_NAME, tableSchema.getFields().get(1).getTypeString()); + } }