Make the names of the synthetic inner tuple and inner field that PigHCatUtil
generates for HCat array columns configurable through the client properties
hcat.pig.inner.tuple.name and hcat.pig.inner.field.name.

diff --git src/java/org/apache/hcatalog/common/HCatConstants.java src/java/org/apache/hcatalog/common/HCatConstants.java
index aa2b762..618fb75 100644
--- src/java/org/apache/hcatalog/common/HCatConstants.java
+++ src/java/org/apache/hcatalog/common/HCatConstants.java
@@ -38,6 +38,12 @@ public final class HCatConstants {
   public static final String HCAT_PIG_ARGS_DELIMIT = "hcat.pig.args.delimiter";
   public static final String HCAT_PIG_ARGS_DELIMIT_DEFAULT = ",";
   public static final String HCAT_PIG_STORER_LOCATION_SET = HCAT_PIG_STORER + ".location.set" ;
+  // Names of the tuple / single field that wrap the elements of an HCat array in Pig.
+  // An override may contain the token FIELDNAME, replaced with the array field's name.
+  public static final String HCAT_PIG_INNER_TUPLE_NAME = "hcat.pig.inner.tuple.name";
+  public static final String HCAT_PIG_INNER_TUPLE_NAME_DEFAULT = "innertuple";
+  public static final String HCAT_PIG_INNER_FIELD_NAME = "hcat.pig.inner.field.name";
+  public static final String HCAT_PIG_INNER_FIELD_NAME_DEFAULT = "innerfield";
 
   //The keys used to store info into the job Configuration
   public static final String HCAT_KEY_BASE = "mapreduce.lib.hcat";
diff --git src/java/org/apache/hcatalog/pig/PigHCatUtil.java src/java/org/apache/hcatalog/pig/PigHCatUtil.java
index 3ef5763..696081f 100644
--- src/java/org/apache/hcatalog/pig/PigHCatUtil.java
+++ src/java/org/apache/hcatalog/pig/PigHCatUtil.java
@@ -194,14 +194,30 @@ public class PigHCatUtil {
     return rfSchema;
   }
 
-  private static ResourceSchema getBagSubSchema(HCatFieldSchema hfs) throws IOException {
+  protected static ResourceSchema getBagSubSchema(HCatFieldSchema hfs) throws IOException {
     // there are two cases - array<Type> and array<struct<...>>
     // in either case the element type of the array is represented in a
     // tuple field schema in the bag's field schema - the second case (struct)
     // more naturally translates to the tuple - in the first case (array<Type>)
     // we simulate the tuple by putting the single field in a tuple
+
+    // The inner tuple/field names can be overridden through client system
+    // properties. The literal token FIELDNAME in an override is substituted with
+    // this field's name; String.replace is used so the name is inserted verbatim.
+    Properties props = UDFContext.getUDFContext().getClientSystemProps();
+    String innerTupleName = HCatConstants.HCAT_PIG_INNER_TUPLE_NAME_DEFAULT;
+    if (props != null && props.containsKey(HCatConstants.HCAT_PIG_INNER_TUPLE_NAME)) {
+      innerTupleName = props.getProperty(HCatConstants.HCAT_PIG_INNER_TUPLE_NAME)
+          .replace("FIELDNAME", hfs.getName());
+    }
+    String innerFieldName = HCatConstants.HCAT_PIG_INNER_FIELD_NAME_DEFAULT;
+    if (props != null && props.containsKey(HCatConstants.HCAT_PIG_INNER_FIELD_NAME)) {
+      innerFieldName = props.getProperty(HCatConstants.HCAT_PIG_INNER_FIELD_NAME)
+          .replace("FIELDNAME", hfs.getName());
+    }
+
     ResourceFieldSchema[] bagSubFieldSchemas = new ResourceFieldSchema[1];
-    bagSubFieldSchemas[0] = new ResourceFieldSchema().setName("innertuple")
+    bagSubFieldSchemas[0] = new ResourceFieldSchema().setName(innerTupleName)
       .setDescription("The tuple in the bag")
       .setType(DataType.TUPLE);
     HCatFieldSchema arrayElementFieldSchema = hfs.getArrayElementSchema().get(0);
@@ -214,7 +230,7 @@ public class PigHCatUtil {
       bagSubFieldSchemas[0].setSchema(s);
     } else {
       ResourceFieldSchema[] innerTupleFieldSchemas = new ResourceFieldSchema[1];
-      innerTupleFieldSchemas[0] = new ResourceFieldSchema().setName("innerfield")
+      innerTupleFieldSchemas[0] = new ResourceFieldSchema().setName(innerFieldName)
         .setDescription("The inner field in the tuple in the bag")
         .setType(getPigType(arrayElementFieldSchema))
         .setSchema(null); // the element type is not a tuple - so no subschema
diff --git src/java/org/apache/hcatalog/pig/TestPigHCatUtil.java src/java/org/apache/hcatalog/pig/TestPigHCatUtil.java
new file mode 100644
index 0000000..6ad08eb
--- /dev/null
+++ src/java/org/apache/hcatalog/pig/TestPigHCatUtil.java
@@ -0,0 +1,84 @@
+package org.apache.hcatalog.pig;
+
+import com.google.common.collect.Lists;
+import junit.framework.Assert;
+import org.apache.hcatalog.common.HCatConstants;
+import org.apache.hcatalog.data.schema.HCatFieldSchema;
+import org.apache.hcatalog.data.schema.HCatSchema;
+import org.apache.pig.ResourceSchema;
+import org.apache.pig.ResourceSchema.ResourceFieldSchema;
+import org.apache.pig.data.DataType;
+import org.apache.pig.impl.util.UDFContext;
+import org.junit.Test;
+
+/**
+ * Tests for the bag sub-schema {@link PigHCatUtil} builds for HCat array fields,
+ * with the default and with overridden inner tuple/field names.
+ */
+public class TestPigHCatUtil {
+
+  @Test
+  public void testGetBagSubSchema() throws Exception {
+
+    // Define the expected schema.
+    ResourceFieldSchema[] bagSubFieldSchemas = new ResourceFieldSchema[1];
+    bagSubFieldSchemas[0] = new ResourceFieldSchema().setName("innertuple")
+        .setDescription("The tuple in the bag").setType(DataType.TUPLE);
+
+    ResourceFieldSchema[] innerTupleFieldSchemas = new ResourceFieldSchema[1];
+    innerTupleFieldSchemas[0] =
+        new ResourceFieldSchema().setName("innerfield").setType(DataType.CHARARRAY);
+
+    bagSubFieldSchemas[0].setSchema(new ResourceSchema().setFields(innerTupleFieldSchemas));
+    ResourceSchema expected = new ResourceSchema().setFields(bagSubFieldSchemas);
+
+    // Get the actual converted schema.
+    HCatSchema hCatSchema = new HCatSchema(Lists.newArrayList(
+        new HCatFieldSchema("innerLlama", HCatFieldSchema.Type.STRING, null)));
+    HCatFieldSchema hCatFieldSchema =
+        new HCatFieldSchema("llama", HCatFieldSchema.Type.ARRAY, hCatSchema, null);
+    ResourceSchema actual = PigHCatUtil.getBagSubSchema(hCatFieldSchema);
+
+    Assert.assertEquals(expected.toString(), actual.toString());
+  }
+
+  @Test
+  public void testGetBagSubSchemaConfigured() throws Exception {
+
+    // NOTE: pig-0.8 sets client system properties by actually getting the client
+    // system properties. Starting in pig-0.9 you must pass the properties in.
+    // When updating our pig dependency this will need to be updated.
+    try {
+      System.setProperty(HCatConstants.HCAT_PIG_INNER_TUPLE_NAME, "t");
+      System.setProperty(HCatConstants.HCAT_PIG_INNER_FIELD_NAME, "FIELDNAME_tuple");
+      UDFContext.getUDFContext().setClientSystemProps();
+
+      // Define the expected schema.
+      ResourceFieldSchema[] bagSubFieldSchemas = new ResourceFieldSchema[1];
+      bagSubFieldSchemas[0] = new ResourceFieldSchema().setName("t")
+          .setDescription("The tuple in the bag").setType(DataType.TUPLE);
+
+      ResourceFieldSchema[] innerTupleFieldSchemas = new ResourceFieldSchema[1];
+      innerTupleFieldSchemas[0] =
+          new ResourceFieldSchema().setName("llama_tuple").setType(DataType.CHARARRAY);
+
+      bagSubFieldSchemas[0].setSchema(new ResourceSchema().setFields(innerTupleFieldSchemas));
+      ResourceSchema expected = new ResourceSchema().setFields(bagSubFieldSchemas);
+
+      // Get the actual converted schema.
+      HCatSchema actualHCatSchema = new HCatSchema(Lists.newArrayList(
+          new HCatFieldSchema("innerLlama", HCatFieldSchema.Type.STRING, null)));
+      HCatFieldSchema actualHCatFieldSchema =
+          new HCatFieldSchema("llama", HCatFieldSchema.Type.ARRAY, actualHCatSchema, null);
+      ResourceSchema actual = PigHCatUtil.getBagSubSchema(actualHCatFieldSchema);
+
+      Assert.assertEquals(expected.toString(), actual.toString());
+    } finally {
+      // Clear the overrides and refresh the UDFContext so the other tests see the
+      // default names regardless of the order the test methods run in.
+      System.clearProperty(HCatConstants.HCAT_PIG_INNER_TUPLE_NAME);
+      System.clearProperty(HCatConstants.HCAT_PIG_INNER_FIELD_NAME);
+      UDFContext.getUDFContext().setClientSystemProps();
+    }
+  }
+}