diff --git a/hcatalog/core/src/main/java/org/apache/hive/hcatalog/data/JsonSerDe.java b/hcatalog/core/src/main/java/org/apache/hive/hcatalog/data/JsonSerDe.java
index 72b37f5..0604ac3 100644
--- a/hcatalog/core/src/main/java/org/apache/hive/hcatalog/data/JsonSerDe.java
+++ b/hcatalog/core/src/main/java/org/apache/hive/hcatalog/data/JsonSerDe.java
@@ -176,15 +176,23 @@ private void populateRecord(List<Object> r, JsonToken token, JsonParser p, HCatS
       fpos = s.getPosition(fieldName);
     } catch (NullPointerException npe) {
       fpos = getPositionFromHiveInternalColumnName(fieldName);
-      LOG.debug("NPE finding position for field [{}] in schema [{}]", fieldName, s);
+      LOG.debug("NPE finding position for field [{}] in schema [{}],"
+        + " attempting to check if it is an internal column name like _col0", fieldName, s);
+      if (fpos == -1) {
+        skipValue(p);
+        return; // unknown field, we return. We'll continue from the next field onwards.
+      }
+      // If we get past this, then the column name did match the hive pattern for an internal
+      // column name, such as _col0, etc, so it *MUST* match the schema for the appropriate column.
+      // This means people can't use arbitrary column names such as _c0 and expect us to ignore
+      // them if we find them.
       if (!fieldName.equalsIgnoreCase(getHiveInternalColumnName(fpos))) {
         LOG.error("Hive internal column name {} and position "
           + "encoding {} for the column name are at odds", fieldName, fpos);
         throw npe;
       }
-      if (fpos == -1) {
-        return; // unknown field, we return.
-      }
+      // If we reached here, then we were successful at finding an alternate internal
+      // column mapping, and we're about to proceed.
     }
     HCatFieldSchema hcatFieldSchema = s.getFields().get(fpos);
     Object currField = extractCurrentField(p, null, hcatFieldSchema, false);
@@ -210,6 +218,27 @@ public int getPositionFromHiveInternalColumnName(String internalName) {
   }
 
   /**
+   * Utility method to extract (and forget) the next value token from the JsonParser,
+   * as a whole. This function gets called to yank out the next value altogether
+   * because it corresponds to a field name that we do not recognize, and thus do not
+   * have a schema/type for. Thus, this field is to be ignored.
+   * @throws IOException
+   * @throws JsonParseException
+   */
+  private void skipValue(JsonParser p) throws JsonParseException, IOException {
+    JsonToken valueToken = p.nextToken();
+
+    if ((valueToken == JsonToken.START_ARRAY) || (valueToken == JsonToken.START_OBJECT)) {
+      // if the currently read token is the beginning of an array or object, move the stream
+      // forward, skipping any child tokens till we're at the corresponding END_ARRAY or END_OBJECT
+      p.skipChildren();
+    }
+    // At the end of this function, the stream should be pointing to the last token that
+    // corresponds to the value being skipped. This way, the next call to nextToken
+    // will advance it to the next field name.
+  }
+
+  /**
    * Utility method to extract current expected field from given JsonParser
    *
    * To get the field, we need either a type or a hcatFieldSchema(necessary for complex types)
diff --git a/hcatalog/core/src/test/java/org/apache/hive/hcatalog/data/TestJsonSerDe.java b/hcatalog/core/src/test/java/org/apache/hive/hcatalog/data/TestJsonSerDe.java
index c8aff6f..0505351 100644
--- a/hcatalog/core/src/test/java/org/apache/hive/hcatalog/data/TestJsonSerDe.java
+++ b/hcatalog/core/src/test/java/org/apache/hive/hcatalog/data/TestJsonSerDe.java
@@ -29,6 +29,7 @@
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hive.conf.HiveConf;
 import org.apache.hadoop.hive.serde.serdeConstants;
+import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.Writable;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -211,4 +212,56 @@ String getInternalNames(String columnNames) {
     }
     return sb.toString();
   }
+
+  /**
+   * This test verifies that our json deserialization is not too strict, as per HIVE-6166.
+   *
+   * i.e., if our schema is "s:struct<a:int,b:string>,k:int", and we pass in
+   * data that looks like : {
+   *   "x" : "abc" ,
+   *   "t" : {
+   *     "a" : "1",
+   *     "b" : "2",
+   *     "c" : [
+   *       { "x" : 2 , "y" : 3 } ,
+   *       { "x" : 3 , "y" : 2 }
+   *     ]
+   *   } ,
+   *   "s" : {
+   *     "a" : 2 ,
+   *     "b" : "blah",
+   *     "c" : "woo"
+   *   }
+   * }
+   *
+   * then it should still work, ignoring the "x" and "t" fields and the "c" subfield of "s",
+   * and it should read k as null.
+   */
+  public void testLooseJsonReadability() throws Exception {
+    Configuration conf = new Configuration();
+    Properties props = new Properties();
+
+    props.put(serdeConstants.LIST_COLUMNS, "s,k");
+    props.put(serdeConstants.LIST_COLUMN_TYPES, "struct<a:int,b:string>,int");
+    JsonSerDe rjsd = new JsonSerDe();
+    rjsd.initialize(conf, props);
+
+    Text jsonText = new Text("{ \"x\" : \"abc\" , "
+      + " \"t\" : { \"a\":\"1\", \"b\":\"2\", \"c\":[ { \"x\":2 , \"y\":3 } , { \"x\":3 , \"y\":2 }] } ,"
+      + "\"s\" : { \"a\" : 2 , \"b\" : \"blah\", \"c\": \"woo\" } }");
+    List<Object> expected = new ArrayList<Object>();
+    List<Object> inner = new ArrayList<Object>();
+    inner.add(2);
+    inner.add("blah");
+    expected.add(inner);
+    expected.add(null);
+    HCatRecord expectedRecord = new DefaultHCatRecord(expected);
+
+    HCatRecord r = (HCatRecord) rjsd.deserialize(jsonText);
+    System.err.println("record : " + r.toString());
+
+    assertTrue(HCatDataCheckUtil.recordsEqual(r, expectedRecord));
+  }
+
 }
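
For reference, a standalone sketch of the skipChildren() contract that skipValue() relies on is below. This is illustrative only and not part of the patch: it uses the Jackson 2.x API (com.fasterxml.jackson.core), whereas JsonSerDe at this point is written against the Jackson 1.x (org.codehaus.jackson) classes, which expose nextToken()/skipChildren() with the same semantics. The class name, JSON input, and field names are made up for the demo.

import com.fasterxml.jackson.core.JsonFactory;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.JsonToken;

public class SkipValueDemo {
  public static void main(String[] args) throws Exception {
    // "skip" carries a nested object, much like the unknown "t" field in the test case.
    String json = "{\"keep\":1,\"skip\":{\"a\":[1,2],\"b\":{\"c\":3}},\"also\":2}";
    JsonParser p = new JsonFactory().createParser(json);

    p.nextToken(); // consume START_OBJECT
    while (p.nextToken() == JsonToken.FIELD_NAME) {
      String field = p.getCurrentName();
      if ("skip".equals(field)) {
        // Same pattern as skipValue(): advance to the value token, and if it opens
        // an array or object, fast-forward past the matching END token.
        JsonToken valueToken = p.nextToken();
        if ((valueToken == JsonToken.START_ARRAY) || (valueToken == JsonToken.START_OBJECT)) {
          p.skipChildren();
        }
        // The stream now rests on the last token of the skipped value, so the
        // loop's next call to nextToken() lands on the next FIELD_NAME.
      } else {
        p.nextToken();
        System.out.println(field + " = " + p.getIntValue());
      }
    }
    // Prints "keep = 1" and "also = 2"; the unrecognized field is dropped without error.
  }
}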